#ifndef EBWT_H_
#define EBWT_H_

#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <vector>
#include <set>
#include <sstream>
#include <fcntl.h>
#include <errno.h>
#include <stdexcept>
#include <seqan/sequence.h>
#include <seqan/index.h>
#include <sys/stat.h>
#ifdef BOWTIE_MM
#include <sys/mman.h>
#include <sys/shm.h>
#endif
#include "auto_array.h"
#include "shmem.h"
#include "alphabet.h"
#include "assert_helpers.h"
#include "bitpack.h"
#include "blockwise_sa.h"
#include "endian_swap.h"
#include "word_io.h"
#include "random_source.h"
#include "hit.h"
#include "ref_read.h"
#include "threading.h"
#include "bitset.h"
#include "str_util.h"
#include "mm.h"
#include "timer.h"
#include "color_dec.h"
#include "reference.h"

#ifdef POPCNT_CAPABILITY 
    #include "processor_support.h" 
#endif 

using namespace std;
using namespace seqan;

#ifndef PREFETCH_LOCALITY
// No locality by default
#define PREFETCH_LOCALITY 2
#endif

// From ccnt_lut.cpp, automatically generated by gen_lookup_tables.pl
extern uint8_t cCntLUT_4[4][4][256];

static const uint64_t c_table[4] = {
	0xffffffffffffffffllu,
	0xaaaaaaaaaaaaaaaallu,
	0x5555555555555555llu,
	0x0000000000000000llu
};

#ifndef VMSG_NL
#define VMSG_NL(args...) \
if(this->verbose()) { \
	stringstream tmp; \
	tmp << args << endl; \
	this->verbose(tmp.str()); \
}
#endif

#ifndef VMSG
#define VMSG(args...) \
if(this->verbose()) { \
	stringstream tmp; \
	tmp << args; \
	this->verbose(tmp.str()); \
}
#endif

/**
 * Flags describing type of Ebwt.
 */
enum EBWT_FLAGS {
	EBWT_COLOR = 2,     // true -> Ebwt is colorspace
	EBWT_ENTIRE_REV = 4 // true -> reverse Ebwt is the whole
	                    // concatenated string reversed, rather than
	                    // each stretch reversed
};

extern string gLastIOErrMsg;

inline bool is_read_err(int fdesc, ssize_t ret, size_t count){
	if (ret < 0) {
		std::stringstream sstm;
		sstm << "ERRNO: " << errno << " ERR Msg:" << strerror(errno) << std::endl;
		gLastIOErrMsg = sstm.str();
		return true;
	}
	return false;
}

/* Checks whether a call to fread() failed or not. */
inline bool is_fread_err(FILE* file_hd, size_t ret, size_t count){
	if (ferror(file_hd)) {
		gLastIOErrMsg = "Error Reading File!";
		return true;
	}
	return false;
}

/**
 * Extended Burrows-Wheeler transform header.  This together with the
 * actual data arrays and other text-specific parameters defined in
 * class Ebwt constitute the entire Ebwt.
 */
class EbwtParams {

public:
	EbwtParams() { }

	EbwtParams(TIndexOffU len,
	           int32_t lineRate,
	           int32_t linesPerSide,
	           int32_t offRate,
	           int32_t isaRate,
	           int32_t ftabChars,
	           bool color,
	           bool entireReverse,
	           bool isBt2Index)
	{
		init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireReverse, isBt2Index);
	}

	EbwtParams(const EbwtParams& eh) {
		init(eh._len, eh._lineRate, eh._linesPerSide, eh._offRate,
		     eh._isaRate, eh._ftabChars, eh._color, eh._entireReverse, eh._isBt2Index);
	}

	void init(TIndexOffU len, int32_t lineRate, int32_t linesPerSide,
	          int32_t offRate, int32_t isaRate, int32_t ftabChars,
	          bool color, bool entireReverse, bool isBt2Index)
	{
		_color = color;
		_isBt2Index = isBt2Index;
		_entireReverse = entireReverse;
		_len = len;
		_bwtLen = _len + 1;
		_sz = (len+3)/4;
		_bwtSz = (len/4 + 1);
		_lineRate = lineRate;
		_linesPerSide = _isBt2Index ? 1 : linesPerSide;
		_origOffRate = offRate;
		_offRate = offRate;
		_offMask = OFF_MASK << _offRate;
		_isaRate = isaRate;
		_isaMask = OFF_MASK << ((_isaRate >= 0) ? _isaRate : 0);
		_ftabChars = ftabChars;
		_eftabLen = _ftabChars*2;
		_eftabSz = _eftabLen*OFF_SIZE;
		_ftabLen = (1 << (_ftabChars*2))+1;
		_ftabSz = _ftabLen*OFF_SIZE;
		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
		_offsSz = (uint64_t)_offsLen*OFF_SIZE;
		_isaLen = (_isaRate == -1)? 0 : ((_bwtLen + (1 << _isaRate) - 1) >> _isaRate);
		_isaSz = _isaLen*OFF_SIZE;
		_lineSz = 1 << _lineRate;
		_sideSz = _lineSz * _linesPerSide;
		_sideBwtSz = _sideSz - 2*OFF_SIZE;
		_sideBwtLen = _sideBwtSz*4;
		_numSidePairs = (_bwtSz+(2*_sideBwtSz)-1)/(2*_sideBwtSz);
		_numSides = _numSidePairs*2;
		_numLines = _numSides * _linesPerSide;
		_ebwtTotLen = _numSidePairs * (2*_sideSz);

		if (_isBt2Index) {
			_sideBwtSz = _sideSz - 4*OFF_SIZE;
			_sideBwtLen = _sideBwtSz*4;
			_numSides = (_bwtSz+(_sideBwtSz)-1)/(_sideBwtSz);
			_numSidePairs = _numSides / 2;
			_numLines = _numSides * _linesPerSide;
			_ebwtTotLen = _numSides * _sideSz;
		}

		_ebwtTotSz = _ebwtTotLen;
		assert(repOk());
	}

	TIndexOffU len() const           { return _len; }
	TIndexOffU bwtLen() const        { return _bwtLen; }
	TIndexOffU sz() const            { return _sz; }
	TIndexOffU bwtSz() const         { return _bwtSz; }
	int32_t  lineRate() const      { return _lineRate; }
	int32_t  linesPerSide() const  { return _linesPerSide; }
	int32_t  origOffRate() const   { return _origOffRate; }
	int32_t  offRate() const       { return _offRate; }
	TIndexOffU offMask() const       { return _offMask; }
	int32_t  isaRate() const       { return _isaRate; }
	uint32_t isaMask() const       { return _isaMask; }
	int32_t  ftabChars() const     { return _ftabChars; }
	uint32_t eftabLen() const      { return _eftabLen; }
	uint32_t eftabSz() const       { return _eftabSz; }
	TIndexOffU ftabLen() const       { return _ftabLen; }
	TIndexOffU ftabSz() const        { return _ftabSz; }
	TIndexOffU offsLen() const       { return _offsLen; }
	uint64_t offsSz() const        { return _offsSz; }
	TIndexOffU isaLen() const        { return _isaLen; }
	uint64_t isaSz() const         { return _isaSz; }
	uint32_t lineSz() const        { return _lineSz; }
	uint32_t sideSz() const        { return _sideSz; }
	uint32_t sideBwtSz() const     { return _sideBwtSz; }
	uint32_t sideBwtLen() const    { return _sideBwtLen; }
	uint32_t numSidePairs() const  { return _numSidePairs; } /* check */
	TIndexOffU numSides() const      { return _numSides; }
	TIndexOffU numLines() const      { return _numLines; }
	TIndexOffU ebwtTotLen() const    { return _ebwtTotLen; }
	TIndexOffU ebwtTotSz() const     { return _ebwtTotSz; }
	bool color() const             { return _color; }
	bool entireReverse() const     { return _entireReverse; }
	bool isBt2Index() const { return _isBt2Index; }

	/**
	 * Set a new suffix-array sampling rate, which involves updating
	 * rate, mask, sample length, and sample size.
	 */
	void setOffRate(int __offRate) {
		_offRate = __offRate;
		_offMask = OFF_MASK << _offRate;
		_offsLen = (_bwtLen + (1 << _offRate) - 1) >> _offRate;
		_offsSz = (uint64_t) _offsLen*OFF_SIZE;
	}

	/**
	 * Set a new inverse suffix-array sampling rate, which involves
	 * updating rate, mask, sample length, and sample size.
	 */
	void setIsaRate(int __isaRate) {
		_isaRate = __isaRate;
		_isaMask = OFF_MASK << _isaRate;
		_isaLen = (_bwtLen + (1 << _isaRate) - 1) >> _isaRate;
		_isaSz = (uint64_t)_isaLen*OFF_SIZE;
	}

	/// Check that this EbwtParams is internally consistent
	bool repOk() const {
		assert_gt(_len, 0);
		assert_gt(_lineRate, 3);
		assert_geq(_offRate, 0);
		assert_leq(_ftabChars, 16);
		assert_geq(_ftabChars, 1);
		assert_lt(_lineRate, 32);
		assert_lt(_linesPerSide, 32);
		assert_lt(_ftabChars, 32);
		assert_eq(0, _ebwtTotSz % (_isBt2Index ? _lineSz : 2*_lineSz));
		return true;
	}

	/**
	 * Pretty-print the header contents to the given output stream.
	 */
	void print(ostream& out) const {
		out << "Headers:" << endl
		    << "    len: "          << _len << endl
		    << "    bwtLen: "       << _bwtLen << endl
		    << "    sz: "           << _sz << endl
		    << "    bwtSz: "        << _bwtSz << endl
		    << "    lineRate: "     << _lineRate << endl
		    << "    linesPerSide: " << _linesPerSide << endl
		    << "    offRate: "      << _offRate << endl
		    << "    offMask: 0x"    << hex << _offMask << dec << endl
		    << "    isaRate: "      << _isaRate << endl
		    << "    isaMask: 0x"    << hex << _isaMask << dec << endl
		    << "    ftabChars: "    << _ftabChars << endl
		    << "    eftabLen: "     << _eftabLen << endl
		    << "    eftabSz: "      << _eftabSz << endl
		    << "    ftabLen: "      << _ftabLen << endl
		    << "    ftabSz: "       << _ftabSz << endl
		    << "    offsLen: "      << _offsLen << endl
		    << "    offsSz: "       << _offsSz << endl
		    << "    isaLen: "       << _isaLen << endl
		    << "    isaSz: "        << _isaSz << endl
		    << "    lineSz: "       << _lineSz << endl
		    << "    sideSz: "       << _sideSz << endl
		    << "    sideBwtSz: "    << _sideBwtSz << endl
		    << "    sideBwtLen: "   << _sideBwtLen << endl
		    << "    numSidePairs: " << _numSidePairs << endl
		    << "    numSides: "     << _numSides << endl
		    << "    numLines: "     << _numLines << endl
		    << "    ebwtTotLen: "   << _ebwtTotLen << endl
		    << "    ebwtTotSz: "    << _ebwtTotSz << endl
		    << "    reverse: "      << _entireReverse << endl;
	}

	TIndexOffU _len;
	TIndexOffU _bwtLen;
	TIndexOffU _sz;
	TIndexOffU _bwtSz;
	int32_t  _lineRate;
	int32_t  _linesPerSide;
	int32_t  _origOffRate;
	int32_t  _offRate;
	TIndexOffU _offMask;
	int32_t  _isaRate;
	uint32_t _isaMask;
	int32_t  _ftabChars;
	uint32_t _eftabLen;
	uint32_t _eftabSz;
	TIndexOffU _ftabLen;
	TIndexOffU _ftabSz;
	TIndexOffU _offsLen;
	uint64_t _offsSz;
	TIndexOffU _isaLen;
	uint64_t _isaSz;
	uint32_t _lineSz;
	uint32_t _sideSz;
	uint32_t _sideBwtSz;
	uint32_t _sideBwtLen;
	uint32_t _numSidePairs;
	TIndexOffU _numSides;
	TIndexOffU _numLines;
	TIndexOffU _ebwtTotLen;
	TIndexOffU _ebwtTotSz;
	bool     _color;
	bool     _entireReverse;
	bool     _isBt2Index;
};

/**
 * Exception to throw when a file-realted error occurs.
 */
class EbwtFileOpenException : public std::runtime_error {
public:
	EbwtFileOpenException(const std::string& msg = "") :
		std::runtime_error(msg) { }
};

/**
 * Calculate size of file with given name.
 */
static inline int64_t fileSize(const char* name) {
	std::ifstream f;
	f.open(name, std::ios_base::binary | std::ios_base::in);
	if (!f.good() || f.eof() || !f.is_open()) { return 0; }
	f.seekg(0, std::ios_base::beg);
	std::ifstream::pos_type begin_pos = f.tellg();
	f.seekg(0, std::ios_base::end);
	return static_cast<int64_t>(f.tellg() - begin_pos);
}

// Forward declarations for Ebwt class
struct SideLocus;
template<typename TStr> class EbwtSearchParams;

/**
 * Extended Burrows-Wheeler transform data.
 *
 * An Ebwt may be transferred to and from RAM with calls to
 * evictFromMemory() and loadIntoMemory().  By default, a newly-created
 * Ebwt is not loaded into memory; if the user would like to use a
 * newly-created Ebwt to answer queries, they must first call
 * loadIntoMemory().
 */
template <typename TStr>
class Ebwt {
public:
	typedef typename Value<TStr>::Type TAlphabet;

	#define Ebwt_INITS \
	    _toBigEndian(currentlyBigEndian()), \
	    _overrideOffRate(__overrideOffRate), \
	    _overrideIsaRate(__overrideIsaRate), \
	    _verbose(verbose), \
	    _passMemExc(passMemExc), \
	    _sanity(sanityCheck), \
	    _isBt2Index(isBt2Index), \
	    _fw(__fw), \
	    _in1(NULL), \
	    _in2(NULL), \
	    _zOff(OFF_MASK), \
	    _zEbwtByteOff(OFF_MASK), \
	    _zEbwtBpOff(-1), \
	    _nPat(0), \
	    _nFrag(0), \
	    _plen(NULL), \
	    _rstarts(NULL), \
	    _fchr(NULL), \
	    _ftab(NULL), \
	    _eftab(NULL), \
	    _offs(NULL), \
	    _isa(NULL), \
	    _ebwt(NULL), \
	    _useMm(false), \
	    useShmem_(false), \
	    _refnames(), \
	    mmFile1_(NULL), \
	    mmFile2_(NULL)

#ifdef EBWT_STATS
#define Ebwt_STAT_INITS \
	,mapLFExs_(0llu), \
	mapLFs_(0llu), \
	mapLFcs_(0llu), \
	mapLF1cs_(0llu), \
	mapLF1s_(0llu)
#else
#define Ebwt_STAT_INITS
#endif

	/// Construct an Ebwt from the given input file
	Ebwt(const string& in,
	     int color,
	     int needEntireReverse,
	     bool __fw,
	     int32_t __overrideOffRate = -1,
	     int32_t __overrideIsaRate = -1,
	     bool useMm = false,
	     bool useShmem = false,
	     bool mmSweep = false,
	     bool loadNames = false,
	     bool verbose = false,
	     bool startVerbose = false,
	     bool passMemExc = false,
	     bool sanityCheck = false,
	     bool isBt2Index = false) :
	     Ebwt_INITS
	     Ebwt_STAT_INITS
	{
		assert(!useMm || !useShmem);
#ifdef POPCNT_CAPABILITY 
        ProcessorSupport ps; 
        _usePOPCNTinstruction = ps.POPCNTenabled(); 
#endif 
		_useMm = useMm;
		useShmem_ = useShmem;
		_in1Str = in + ".1." + gEbwt_ext;
		_in2Str = in + ".2." + gEbwt_ext;
		readIntoMemory(
			color,         // expect colorspace reference?
			__fw ? -1 : needEntireReverse, // need REF_READ_REVERSE
			true,          // stop after loading the header portion?
			&_eh,          // params structure to fill in
			mmSweep,       // mmSweep
			loadNames,     // loadNames
			startVerbose); // startVerbose
		// If the offRate has been overridden, reflect that in the
		// _eh._offRate field
		if(_overrideOffRate > _eh._offRate) {
			_eh.setOffRate(_overrideOffRate);
			assert_eq(_overrideOffRate, _eh._offRate);
		}
		// Same with isaRate
		if(_overrideIsaRate > _eh._isaRate) {
			_eh.setIsaRate(_overrideIsaRate);
			assert_eq(_overrideIsaRate, _eh._isaRate);
		}
		assert(repOk());
	}

	/// Construct an Ebwt from the given header parameters and string
	/// vector, optionally using a blockwise suffix sorter with the
	/// given 'bmax' and 'dcv' parameters.  The string vector is
	/// ultimately joined and the joined string is passed to buildToDisk().
	Ebwt(int color,
	     int32_t lineRate,
	     int32_t linesPerSide,
	     int32_t offRate,
	     int32_t isaRate,
	     int32_t ftabChars,
	     int nthreads,
	     const string& file,   // base filename for EBWT files
	     bool __fw,
	     bool useBlockwise,
	     TIndexOffU bmax,
	     TIndexOffU bmaxSqrtMult,
	     TIndexOffU bmaxDivN,
	     int dcv,
	     vector<FileBuf*>& is,
	     vector<RefRecord>& szs,
	     vector<uint32_t>& plens,
	     TIndexOffU sztot,
	     const RefReadInParams& refparams,
	     uint32_t seed,
	     int32_t __overrideOffRate = -1,
	     int32_t __overrideIsaRate = -1,
	     bool verbose = false,
	     bool passMemExc = false,
	     bool sanityCheck = false,
	     bool isBt2Index = false) :
	     Ebwt_INITS
	     Ebwt_STAT_INITS,
	     _eh(joinedLen(szs),
	         lineRate,
	         linesPerSide,
	         offRate,
	         isaRate,
	         ftabChars,
	         color,
	         refparams.reverse == REF_READ_REVERSE,
	         false /* is this a bt2 index? */)
	{
#ifdef POPCNT_CAPABILITY 
        ProcessorSupport ps; 
        _usePOPCNTinstruction = ps.POPCNTenabled(); 
#endif 
		_in1Str = file + ".1." + gEbwt_ext;
		_in2Str = file + ".2." + gEbwt_ext;
		// Open output files
		ofstream fout1(_in1Str.c_str(), ios::binary);
		if(!fout1.good()) {
			cerr << "Could not open index file for writing: \"" << _in1Str << "\"" << endl
			     << "Please make sure the directory exists and that permissions allow writing by" << endl
			     << "Bowtie." << endl;
			throw 1;
		}
		ofstream fout2(_in2Str.c_str(), ios::binary);
		if(!fout2.good()) {
			cerr << "Could not open index file for writing: \"" << _in2Str << "\"" << endl
			     << "Please make sure the directory exists and that permissions allow writing by" << endl
			     << "Bowtie." << endl;
			throw 1;
		}
		// Build
		initFromVector(
			is,
			szs,
			plens,
			sztot,
			refparams,
			fout1,
			fout2,
			file,
			nthreads,
			useBlockwise,
			bmax,
			bmaxSqrtMult,
			bmaxDivN,
			dcv,
			seed);
		// Close output files
		fout1.flush();
		int64_t tellpSz1 = (int64_t)fout1.tellp();
		VMSG_NL("Wrote " << fout1.tellp() << " bytes to primary EBWT file: " << _in1Str);
		fout1.close();
		bool err = false;
		if(tellpSz1 > fileSize(_in1Str.c_str())) {
			err = true;
			cerr << "Index is corrupt: File size for " << _in1Str << " should have been " << tellpSz1
			     << " but is actually " << fileSize(_in1Str.c_str()) << "." << endl;
		}
		fout2.flush();
		int64_t tellpSz2 = (int64_t)fout2.tellp();
		VMSG_NL("Wrote " << fout2.tellp() << " bytes to secondary EBWT file: " << _in2Str);
		fout2.close();
		if(tellpSz2 > fileSize(_in2Str.c_str())) {
			err = true;
			cerr << "Index is corrupt: File size for " << _in2Str << " should have been " << tellpSz2
			     << " but is actually " << fileSize(_in2Str.c_str()) << "." << endl;
		}
		if(err) {
			cerr << "Please check if there is a problem with the disk or if disk is full." << endl;
			throw 1;
		}
		// Reopen as input streams
		VMSG_NL("Re-opening _in1 and _in2 as input streams");
		if(_sanity) {
			VMSG_NL("Sanity-checking Ebwt");
			assert(!isInMemory());
			readIntoMemory(
				color,
				__fw ? -1 : refparams.reverse == REF_READ_REVERSE,
				false,
				NULL,
				false,
				true,
				false);
			sanityCheckAll(refparams.reverse);
			evictFromMemory();
			assert(!isInMemory());
		}
		VMSG_NL("Returning from Ebwt constructor");
	}

	bool isPacked();

	/**
	 * Write the rstarts array given the szs array for the reference.
	 */
	void szsToDisk(const vector<RefRecord>& szs, ostream& os, int reverse) {
		TIndexOffU seq = 0;
		TIndexOffU off = 0;
		TIndexOffU totlen = 0;
		for(unsigned int i = 0; i < szs.size(); i++) {
			if(szs[i].len == 0) continue;
			if(szs[i].first) off = 0;
			off += szs[i].off;
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
			if(szs[i].first && szs[i].len > 0) seq++;
#else
			if(szs[i].first) seq++;
#endif
			TIndexOffU seqm1 = seq-1;
			assert_lt(seqm1, _nPat);
			TIndexOffU fwoff = off;
			if(reverse == REF_READ_REVERSE) {
				// Invert pattern idxs
				seqm1 = _nPat - seqm1 - 1;
				// Invert pattern idxs
				assert_leq(off + szs[i].len, _plen[seqm1]);
				fwoff = _plen[seqm1] - (off + szs[i].len);
			}
			writeU<TIndexOffU>(os, totlen, this->toBe()); // offset from beginning of joined string
			writeU<TIndexOffU>(os, seqm1,  this->toBe()); // sequence id
			writeU<TIndexOffU>(os, fwoff,  this->toBe()); // offset into sequence
			totlen += szs[i].len;
			off += szs[i].len;
		}
	}

	/**
	 * Helper for the constructors above.  Takes a vector of text
	 * strings and joins them into a single string with a call to
	 * joinToDisk, which does a join (with padding) and writes some of
	 * the resulting data directly to disk rather than keep it in
	 * memory.  It then constructs a suffix-array producer (what kind
	 * depends on 'useBlockwise') for the resulting sequence.  The
	 * suffix-array producer can then be used to obtain chunks of the
	 * joined string's suffix array.
	 */
	void initFromVector(
		vector<FileBuf*>& is,
		vector<RefRecord>& szs,
		vector<uint32_t>& plens,
		TIndexOffU sztot,
		const RefReadInParams& refparams,
		ofstream& out1,
		ofstream& out2,
		const string& outfile,
		int nthreads,
		bool useBlockwise,
		TIndexOffU bmax,
		TIndexOffU bmaxSqrtMult,
		TIndexOffU bmaxDivN,
		int dcv,
		uint32_t seed)
	{
		// Compose text strings into single string
		VMSG_NL("Calculating joined length");
		TStr s; // holds the entire joined reference after call to joinToDisk
		TIndexOffU jlen;
		jlen = joinedLen(szs);
		assert_geq(jlen, sztot);
		VMSG_NL("Writing header");
		writeFromMemory(true, out1, out2);
		try {
			VMSG_NL("Reserving space for joined string");
			seqan::reserve(s, jlen, Exact());
			VMSG_NL("Joining reference sequences");
			if(refparams.reverse == REF_READ_REVERSE) {
				{
					Timer timer(cout, "  Time to join reference sequences: ", _verbose);
					joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
				} {
					Timer timer(cout, "  Time to reverse reference sequence: ", _verbose);
					vector<RefRecord> tmp;
					reverseInPlace(s);
					reverseRefRecords(szs, tmp, false, false);
					szsToDisk(tmp, out1, refparams.reverse);
				}
			} else {
				Timer timer(cout, "  Time to join reference sequences: ", _verbose);
				joinToDisk(is, szs, plens, sztot, refparams, s, out1, out2, seed);
				szsToDisk(szs, out1, refparams.reverse);
			}
			// Joined reference sequence now in 's'
		} catch(bad_alloc& e) {
			// If we throw an allocation exception in the try block,
			// that means that the joined version of the reference
			// string itself is too larger to fit in memory.  The only
			// alternatives are to tell the user to give us more memory
			// or to try again with a packed representation of the
			// reference (if we haven't tried that already).
			cerr << "Could not allocate space for a joined string of " << jlen << " elements." << endl;
			if(!isPacked() && _passMemExc) {
				// Pass the exception up so that we can retry using a
				// packed string representation
				throw e;
			}
			// There's no point passing this exception on.  The fact
			// that we couldn't allocate the joined string means that
			// --bmax is irrelevant - the user should re-run with
			// ebwt-build-packed
			if(isPacked()) {
				cerr << "Please try running bowtie-build on a computer with more memory." << endl;
			} else {
				cerr << "Please try running bowtie-build in packed mode (-p/--packed) or in automatic" << endl
				     << "mode (-a/--auto), or try again on a computer with more memory." << endl;
			}
			if(sizeof(void*) == 4) {
				cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
				     << "this executable is 32-bit." << endl;
			}
			throw 1;
		}
		// Succesfully obtained joined reference string
		assert_geq(length(s), jlen);
		if(bmax != OFF_MASK) {
			VMSG_NL("bmax according to bmax setting: " << bmax);
		}
		else if(bmaxSqrtMult != OFF_MASK) {
			bmax *= bmaxSqrtMult;
			VMSG_NL("bmax according to bmaxSqrtMult setting: " << bmax);
		}
		else if(bmaxDivN != OFF_MASK) {
			bmax = max<TIndexOffU>(jlen / bmaxDivN, 1);
			VMSG_NL("bmax according to bmaxDivN setting: " << bmax);
		}
		else {
			bmax = (TIndexOffU)sqrt(length(s));
			VMSG_NL("bmax defaulted to: " << bmax);
		}
		int iter = 0;
		bool first = true;
		// Look for bmax/dcv parameters that work.
		while(true) {
			if(!first && bmax < 40 && _passMemExc) {
				cerr << "Could not find approrpiate bmax/dcv settings for building this index." << endl;
				if(!isPacked()) {
					// Throw an exception exception so that we can
					// retry using a packed string representation
					throw bad_alloc();
				} else {
					cerr << "Already tried a packed string representation." << endl;
				}
				cerr << "Please try indexing this reference on a computer with more memory." << endl;
				if(sizeof(void*) == 4) {
					cerr << "If this computer has more than 4 GB of memory, try using a 64-bit executable;" << endl
						 << "this executable is 32-bit." << endl;
				}
				throw 1;
			}
			if(dcv > 4096) dcv = 4096;
			if((iter % 6) == 5 && dcv < 4096 && dcv != 0) {
				dcv <<= 1; // double difference-cover period
			} else {
				bmax -= (bmax >> 2); // reduce by 25%
			}
			VMSG("Using parameters --bmax " << bmax);
			if(dcv == 0) {
				VMSG_NL(" and *no difference cover*");
			} else {
				VMSG_NL(" --dcv " << dcv);
			}
			iter++;
			try {
				{
					VMSG_NL("  Doing ahead-of-time memory usage test");
					// Make a quick-and-dirty attempt to force a bad_alloc iff
					// we would have thrown one eventually as part of
					// constructing the DifferenceCoverSample
					dcv <<= 1;
					TIndexOffU sz = (TIndexOffU)DifferenceCoverSample<TStr>::simulateAllocs(s, dcv >> 1);
					if(nthreads > 1) sz *= (nthreads + 1);
					AutoArray<uint8_t> tmp(sz);
					dcv >>= 1;
					// Likewise with the KarkkainenBlockwiseSA
					sz = (TIndexOffU)KarkkainenBlockwiseSA<TStr>::simulateAllocs(s, bmax);
					AutoArray<uint8_t> tmp2(sz);
					// Now throw in the 'ftab' and 'isaSample' structures
					// that we'll eventually allocate in buildToDisk
					AutoArray<TIndexOffU> ftab(_eh._ftabLen * 2);
					AutoArray<uint8_t> side(_eh._sideSz);
					// Grab another 20 MB out of caution
					AutoArray<uint32_t> extra(20*1024*1024);
					// If we made it here without throwing bad_alloc, then we
					// passed the memory-usage stress test
					VMSG("  Passed!  Constructing with these parameters: --bmax " << bmax << " --dcv " << dcv);
					if(isPacked()) {
						VMSG(" --packed");
					}
					VMSG_NL("");
				}
				VMSG_NL("Constructing suffix-array element generator");
				KarkkainenBlockwiseSA<TStr> bsa(s, bmax, nthreads, dcv, seed, _sanity, _passMemExc, _verbose, outfile);
				assert(bsa.suffixItrIsReset());
				assert_eq(bsa.size(), length(s)+1);
				VMSG_NL("Converting suffix-array elements to index image");
				buildToDisk(bsa, s, out1, out2);
				out1.flush(); out2.flush();
				if(out1.fail() || out2.fail()) {
					cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
					throw 1;
				}
				break;
			} catch(bad_alloc& e) {
				if(_passMemExc) {
					VMSG_NL("  Ran out of memory; automatically trying more memory-economical parameters.");
				} else {
					cerr << "Out of memory while constructing suffix array.  Please try using a smaller" << endl
						 << "number of blocks by specifying a smaller --bmax or a larger --bmaxdivn" << endl;
					throw 1;
				}
			}
			first = false;
		}
		assert(repOk());
		// Now write reference sequence names on the end
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
		assert_geq(this->_refnames.size(), this->_nPat);
#else
		assert_eq(this->_refnames.size(), this->_nPat);
#endif
		for(TIndexOffU i = 0; i < this->_refnames.size(); i++) {
			out1 << this->_refnames[i] << endl;
		}
		out1 << '\0';
		out1.flush(); out2.flush();
		if(out1.fail() || out2.fail()) {
			cerr << "An error occurred writing the index to disk.  Please check if the disk is full." << endl;
			throw 1;
		}
		VMSG_NL("Returning from initFromVector");
	}

	/**
	 * Return the length that the joined string of the given string
	 * list will have.  Note that this is indifferent to how the text
	 * fragments correspond to input sequences - it just cares about
	 * the lengths of the fragments.
	 */
	TIndexOffU joinedLen(vector<RefRecord>& szs) {
		TIndexOffU ret = 0;
		for(unsigned int i = 0; i < szs.size(); i++) {
			ret += szs[i].len;
		}
		return ret;
	}

	/// Destruct an Ebwt
	~Ebwt() {
		// Only free buffers if we're *not* using memory-mapped files
		if(!_useMm) {
			// Delete everything that was allocated in read(false, ...)
			if(_fchr    != NULL) delete[] _fchr;    _fchr    = NULL;
			if(_ftab    != NULL) delete[] _ftab;    _ftab    = NULL;
			if(_eftab   != NULL) delete[] _eftab;   _eftab   = NULL;
			if(_offs != NULL && !useShmem_) {
				delete[] _offs; _offs = NULL;
			} else if(_offs != NULL && useShmem_) {
				FREE_SHARED(_offs);
			}
			if(_isa     != NULL) delete[] _isa;     _isa     = NULL;
			if(_plen    != NULL) delete[] _plen;    _plen    = NULL;
			if(_rstarts != NULL) delete[] _rstarts; _rstarts = NULL;
			if(_ebwt != NULL && !useShmem_) {
				delete[] _ebwt; _ebwt = NULL;
			} else if(_ebwt != NULL && useShmem_) {
				FREE_SHARED(_ebwt);
			}
		}
		if (_in1 != NULL) fclose(_in1);
		if (_in2 != NULL) fclose(_in2);
#ifdef EBWT_STATS
		cout << (_fw ? "Forward index:" : "Mirror index:") << endl;
		cout << "  mapLFEx:   " << mapLFExs_ << endl;
		cout << "  mapLF:     " << mapLFs_   << endl;
		cout << "  mapLF(c):  " << mapLFcs_  << endl;
		cout << "  mapLF1(c): " << mapLF1cs_ << endl;
		cout << "  mapLF(c):  " << mapLF1s_  << endl;
#endif
	}

	/// Accessors
	const EbwtParams& eh() const     { return _eh; }
	TIndexOffU    zOff() const         { return _zOff; }
	TIndexOffU    zEbwtByteOff() const { return _zEbwtByteOff; }
	TIndexOff         zEbwtBpOff() const   { return _zEbwtBpOff; }
	TIndexOffU    nPat() const         { return _nPat; }
	TIndexOffU    nFrag() const        { return _nFrag; }
	TIndexOffU*   fchr() const         { return _fchr; }
	TIndexOffU*   ftab() const         { return _ftab; }
	TIndexOffU*   eftab() const        { return _eftab; }
	TIndexOffU*   offs() const         { return _offs; }
	TIndexOffU*   isa() const          { return _isa; } /* check */
	TIndexOffU*   plen() const         { return _plen; }
	TIndexOffU*   rstarts() const      { return _rstarts; }
	uint8_t*    ebwt() const         { return _ebwt; }
	bool        toBe() const         { return _toBigEndian; }
	bool        verbose() const      { return _verbose; }
	bool        sanityCheck() const  { return _sanity; }
	vector<string>& refnames()       { return _refnames; }
	bool        fw() const           { return _fw; }
#ifdef POPCNT_CAPABILITY 
    bool _usePOPCNTinstruction; 
#endif 

	/// Return true iff the Ebwt is currently in memory
	bool isInMemory() const {
		if(_ebwt != NULL) {
			assert(_eh.repOk());
			assert(_ftab != NULL);
			assert(_eftab != NULL);
			assert(_fchr != NULL);
			assert(_offs != NULL);
			assert(_isa != NULL);
			assert(_rstarts != NULL);
			assert_neq(_zEbwtByteOff, OFF_MASK);
			assert_neq(_zEbwtBpOff, -1);
			return true;
		} else {
			assert(_ftab == NULL);
			assert(_eftab == NULL);
			assert(_fchr == NULL);
			assert(_offs == NULL);
			assert(_rstarts == NULL);
			assert_eq(_zEbwtByteOff, OFF_MASK);
			assert_eq(_zEbwtBpOff, -1);
			return false;
		}
	}

	/// Return true iff the Ebwt is currently stored on disk
	bool isEvicted() const {
		return !isInMemory();
	}

	/**
	 * Load this Ebwt into memory by reading it in from the _in1 and
	 * _in2 streams.
	 */
	void loadIntoMemory(
		int color,
		int needEntireReverse,
		bool loadNames,
		bool verbose)
	{
		readIntoMemory(
			color,      // expect index to be colorspace?
			needEntireReverse, // require reverse index to be concatenated reference reversed
			false,      // stop after loading the header portion?
			NULL,       // params
			false,      // mmSweep
			loadNames,  // loadNames
			verbose);   // startVerbose
	}

	/**
	 * Frees memory associated with the Ebwt.
	 */
	void evictFromMemory() {
		assert(isInMemory());
		if(!_useMm) {
			delete[] _fchr;
			delete[] _ftab;
			delete[] _eftab;
			if(!useShmem_) delete[] _offs;
			delete[] _isa;
			// Keep plen; it's small and the client may want to query it
			// even when the others are evicted.
			//delete[] _plen;
			delete[] _rstarts;
			if(!useShmem_) delete[] _ebwt;
		}
		_fchr  = NULL;
		_ftab  = NULL;
		_eftab = NULL;
		_offs  = NULL;
		_isa   = NULL;
		// Keep plen; it's small and the client may want to query it
		// even when the others are evicted.
		//_plen  = NULL;
		_rstarts = NULL;
		_ebwt    = NULL;
		_zEbwtByteOff = OFF_MASK;
		_zEbwtBpOff = -1;
	}

	/**
	 * Non-static facade for static function ftabHi.
	 */
	TIndexOffU ftabHi(TIndexOffU i) const {
		return Ebwt::ftabHi(_ftab, _eftab, _eh._len, _eh._ftabLen,
		                    _eh._eftabLen, i);
	}

	/**
	 * Get "high interpretation" of ftab entry at index i.  The high
	 * interpretation of a regular ftab entry is just the entry
	 * itself.  The high interpretation of an extended entry is the
	 * second correpsonding ui32 in the eftab.
	 *
	 * It's a static member because it's convenient to ask this
	 * question before the Ebwt is fully initialized.
	 */
	static TIndexOffU ftabHi(TIndexOffU *ftab,
			TIndexOffU *eftab,
			TIndexOffU len,
			TIndexOffU ftabLen,
			TIndexOffU eftabLen,
			TIndexOffU i)
	{
		assert_lt(i, ftabLen);
		if(ftab[i] <= len) {
			return ftab[i];
		} else {
			TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
			assert_lt(efIdx*2+1, eftabLen);
			return eftab[efIdx*2+1];
		}
	}

	/**
	 * Non-static facade for static function ftabLo.
	 */
	TIndexOffU ftabLo(TIndexOffU i) const {
		return Ebwt::ftabLo(_ftab, _eftab, _eh._len, _eh._ftabLen,
		                    _eh._eftabLen, i);
	}

	/**
	 * Get "low interpretation" of ftab entry at index i.  The low
	 * interpretation of a regular ftab entry is just the entry
	 * itself.  The low interpretation of an extended entry is the
	 * first correpsonding ui32 in the eftab.
	 *
	 * It's a static member because it's convenient to ask this
	 * question before the Ebwt is fully initialized.
	 */
	static TIndexOffU ftabLo(TIndexOffU *ftab,
			TIndexOffU *eftab,
			TIndexOffU len,
			TIndexOffU ftabLen,
			TIndexOffU eftabLen,
			TIndexOffU i)
	{
		assert_lt(i, ftabLen);
		if(ftab[i] <= len) {
			return ftab[i];
		} else {
			TIndexOffU efIdx = ftab[i] ^ OFF_MASK;
			assert_lt(efIdx*2+1, eftabLen);
			return eftab[efIdx*2];
		}
	}

	/**
	 * When using read() to create an Ebwt, we have to set a couple of
	 * additional fields in the Ebwt object that aren't part of the
	 * parameter list and are not stored explicitly in the file.  Right
	 * now, this just involves initializing _zEbwtByteOff and
	 * _zEbwtBpOff from _zOff.
	 */
	void postReadInit(EbwtParams& eh) {
		TIndexOffU sideNum     = _zOff / eh._sideBwtLen;
		TIndexOffU sideCharOff = _zOff % eh._sideBwtLen;
		TIndexOffU sideByteOff = sideNum * eh._sideSz;
		_zEbwtByteOff = sideCharOff >> 2;
		assert_lt(_zEbwtByteOff, eh._sideBwtSz);
		_zEbwtBpOff = sideCharOff & 3;
		assert_lt(_zEbwtBpOff, 4);
		if((sideNum & 1) == 0) {
			// This is an even (backward) side
			_zEbwtByteOff = eh._sideBwtSz - _zEbwtByteOff - 1;
			_zEbwtBpOff = 3 - _zEbwtBpOff;
			assert_lt(_zEbwtBpOff, 4);
		}
		_zEbwtByteOff += sideByteOff;
		assert(repOk(eh)); // Ebwt should be fully initialized now
	}

	/**
	 * Pretty-print the Ebwt to the given output stream.
	 */
	void print(ostream& out) const {
		print(out, _eh);
	}

	/**
	 * Pretty-print the Ebwt and given EbwtParams to the given output
	 * stream.
	 */
	void print(ostream& out, const EbwtParams& eh) const {
		eh.print(out); // print params
		out << "Ebwt (" << (isInMemory()? "memory" : "disk") << "):" << endl
		    << "    zOff: "         << _zOff << endl
		    << "    zEbwtByteOff: " << _zEbwtByteOff << endl
		    << "    zEbwtBpOff: "   << _zEbwtBpOff << endl
		    << "    nPat: "  << _nPat << endl
		    << "    plen: ";
		if(_plen == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << _plen[0] << endl;
		}
		out << "    rstarts: ";
		if(_rstarts == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << _rstarts[0] << endl;
		}
		out << "    ebwt: ";
		if(_ebwt == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << _ebwt[0] << endl;
		}
		out << "    fchr: ";
		if(_fchr == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << _fchr[0] << endl;
		}
		out << "    ftab: ";
		if(_ftab == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << _ftab[0] << endl;
		}
		out << "    eftab: ";
		if(_eftab == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << _eftab[0] << endl;
		}
		out << "    offs: ";
		if(_offs == NULL) {
			out << "NULL" << endl;
		} else {
			out << "non-NULL, [0] = " << _offs[0] << endl;
		}
	}

	// Building
	static TStr join(vector<TStr>& l, uint32_t seed);
	static TStr join(vector<FileBuf*>& l, vector<RefRecord>& szs, TIndexOffU sztot, const RefReadInParams& refparams, uint32_t seed);
	void joinToDisk(vector<FileBuf*>& l, vector<RefRecord>& szs, vector<uint32_t>& plens, TIndexOffU sztot, const RefReadInParams& refparams, TStr& ret, ostream& out1, ostream& out2, uint32_t seed = 0);
	void buildToDisk(InorderBlockwiseSA<TStr>& sa, const TStr& s, ostream& out1, ostream& out2);

	// I/O
	void readIntoMemory(int color, int needEntireReverse, bool justHeader, EbwtParams *params, bool mmSweep, bool loadNames, bool startVerbose);
	void writeFromMemory(bool justHeader, ostream& out1, ostream& out2) const;
	void writeFromMemory(bool justHeader, const string& out1, const string& out2) const;

	// Sanity checking
	void printRangeFw(uint32_t begin, uint32_t end) const;
	void printRangeBw(uint32_t begin, uint32_t end) const;
	void sanityCheckUpToSide(TIndexOff upToSide) const;
	void sanityCheckAll(int reverse) const;
	void restore(TStr& s) const;
	void checkOrigs(const vector<String<Dna5> >& os, bool color, bool mirror) const;

	// Searching and reporting
	void joinedToTextOff(TIndexOffU qlen, TIndexOffU off, TIndexOffU& tidx, TIndexOffU& textoff, TIndexOffU& tlen) const;
	inline bool report(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<TIndexOffU>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, TIndexOffU off, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params) const;
	inline bool reportChaseOne(const String<Dna5>& query, String<char>* quals, String<char>* name, bool color, char primer, char trimc, bool colExEnds, int snpPhred, const BitPairReference* ref, const std::vector<TIndexOffU>& mmui32, const std::vector<uint8_t>& refcs, size_t numMms, TIndexOffU i, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, uint16_t cost, uint32_t patid, uint32_t seed, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
	inline bool reportReconstruct(const String<Dna5>& query, String<char>* quals, String<char>* name, String<Dna5>& lbuf, String<Dna5>& rbuf, const TIndexOffU *mmui32, const char* refcs, size_t numMms, TIndexOffU i, TIndexOffU top, TIndexOffU bot, uint32_t qlen, int stratum, const EbwtSearchParams<TStr>& params, SideLocus *l = NULL) const;
	inline int rowL(const SideLocus& l) const;
	inline TIndexOffU countUpTo(const SideLocus& l, int c) const;
	inline void countUpToEx(const SideLocus& l, TIndexOffU* pairs) const;
	inline TIndexOffU countFwSide(const SideLocus& l, int c) const;
	inline void countFwSideEx(const SideLocus& l, TIndexOffU *pairs) const;
	inline TIndexOffU countBwSide(const SideLocus& l, int c) const;
	inline void countBwSideEx(const SideLocus& l, TIndexOffU *pairs) const;
	inline TIndexOffU countBt2Side(const SideLocus& l, int c) const;
	inline void countBt2SideEx(const SideLocus& l, TIndexOffU *pairs) const;
	inline TIndexOffU mapLF(const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
	inline void mapLFEx(const SideLocus& l, TIndexOffU *pairs ASSERT_ONLY(, bool overrideSanity = false)) const;
	inline void mapLFEx(const SideLocus& ltop, const SideLocus& lbot, TIndexOffU *tops, TIndexOffU *bots ASSERT_ONLY(, bool overrideSanity = false)) const;
	inline TIndexOffU mapLF(const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
	inline TIndexOffU mapLF1(TIndexOffU row, const SideLocus& l, int c ASSERT_ONLY(, bool overrideSanity = false)) const;
	inline int mapLF1(TIndexOffU& row, const SideLocus& l ASSERT_ONLY(, bool overrideSanity = false)) const;
	/// Check that in-memory Ebwt is internally consistent with respect
	/// to given EbwtParams; assert if not
	bool inMemoryRepOk(const EbwtParams& eh) const {
		assert_leq(ValueSize<TAlphabet>::VALUE, 4);
		assert_geq(_zEbwtBpOff, 0);
		assert_lt(_zEbwtBpOff, 4);
		assert_lt(_zEbwtByteOff, eh._ebwtTotSz);
		assert_lt(_zOff, eh._bwtLen);
		assert(_rstarts != NULL);
		assert_geq(_nFrag, _nPat);
		return true;
	}

	/// Check that in-memory Ebwt is internally consistent; assert if
	/// not
	bool inMemoryRepOk() const {
		return repOk(_eh);
	}

	/// Check that Ebwt is internally consistent with respect to given
	/// EbwtParams; assert if not
	bool repOk(const EbwtParams& eh) const {
		assert(_eh.repOk());
		if(isInMemory()) {
			return inMemoryRepOk(eh);
		}
		return true;
	}

	/// Check that Ebwt is internally consistent; assert if not
	bool repOk() const {
		return repOk(_eh);
	}

	bool       _toBigEndian;
	int32_t    _overrideOffRate;
	int32_t    _overrideIsaRate;
	bool       _verbose;
	bool       _passMemExc;
	bool       _sanity;
	bool       _isBt2Index;
	bool       _fw;     // true iff this is a forward index
	FILE      *_in1;    // input fd for primary index file
	FILE      *_in2;    // input fd for secondary index file
	string     _in1Str; // filename for primary index file
	string     _in2Str; // filename for secondary index file
	TIndexOffU   _zOff;
	TIndexOffU   _zEbwtByteOff;
	TIndexOff        _zEbwtBpOff;
	TIndexOffU   _nPat;  /// number of reference texts
	TIndexOffU   _nFrag; /// number of fragments
	TIndexOffU*  _plen;
	TIndexOffU*  _rstarts; // starting offset of fragments / text indexes
	// _fchr, _ftab and _eftab are expected to be relatively small
	// (usually < 1MB, perhaps a few MB if _fchr is particularly large
	// - like, say, 11).  For this reason, we don't bother with writing
	// them to disk through separate output streams; we
	TIndexOffU*  _fchr;
	TIndexOffU*  _ftab;
	TIndexOffU*  _eftab; // "extended" entries for _ftab
	// _offs may be extremely large.  E.g. for DNA w/ offRate=4 (one
	// offset every 16 rows), the total size of _offs is the same as
	// the total size of the input sequence
	TIndexOffU*  _offs;
	TIndexOffU*  _isa;
	// _ebwt is the Extended Burrows-Wheeler Transform itself, and thus
	// is at least as large as the input sequence.
	uint8_t*   _ebwt;
	bool       _useMm;        /// use memory-mapped files to hold the index
	bool       useShmem_;     /// use shared memory to hold large parts of the index
	vector<string> _refnames; /// names of the reference sequences
	char *mmFile1_;
	char *mmFile2_;
	EbwtParams _eh;

#ifdef BOWTIE_64BIT_INDEX
	static const int      default_lineRate = 7;
#else
	static const int      default_lineRate = 6;
#endif

	#ifdef EBWT_STATS
	uint64_t   mapLFExs_;
	uint64_t   mapLFs_;
	uint64_t   mapLFcs_;
#endif

private:

	ostream& log() const {
		return cout; // TODO: turn this into a parameter
	}

	/// Print a verbose message and flush (flushing is helpful for
	/// debugging)
	void verbose(const string& s) const {
		if(this->verbose()) {
			this->log() << s;
			this->log().flush();
		}
	}
};

/// Specialization for packed Ebwts - return true
template<>
bool Ebwt<String<Dna, Packed<> > >::isPacked() {
	return true;
}

/// By default, Ebwts are not packed
template<typename TStr>
bool Ebwt<TStr>::isPacked() {
	return false;
}

/**
 * Structure encapsulating search parameters, such as whether and how
 * to backtrack and how to deal with multiple equally-good hits.
 */
template<typename TStr>
class EbwtSearchParams {
public:
	EbwtSearchParams(HitSinkPerThread& sink,
	                 const vector<String<Dna5> >& texts,
	                 bool fw = true,
	                 bool ebwtFw = true) :
		_sink(sink),
		_texts(texts),
		_patid(0xffffffff),
		_fw(fw) { }

	HitSinkPerThread& sink() const { return _sink; }
	void setPatId(uint32_t patid)  { _patid = patid; }
	uint32_t patId() const         { return _patid; }
	void setFw(bool fw)            { _fw = fw; }
	bool fw() const                { return _fw; }
	/**
	 * Report a hit.  Returns true iff caller can call off the search.
	 */
	bool reportHit(const String<Dna5>& query, // read sequence
	               String<char>* quals, // read quality values
	               String<char>* name,  // read name
	               bool color,          // true -> read is colorspace
	               char primer,         // primer base trimmed from beginning
	               char trimc,          // first color trimmed from beginning
	               bool colExEnds,      // true -> exclude nucleotides at extreme ends after decoding
	               int snpPhred,        // penalty for a SNP
	               const BitPairReference* ref, // reference (= NULL if not necessary)
	               bool ebwtFw,         // whether index is forward (true) or mirror (false)
	               const std::vector<TIndexOffU>& mmui32, // mismatch list
	               const std::vector<uint8_t>& refcs,  // reference characters
	               size_t numMms,      // # mismatches
	               UPair h,          // ref coords
	               UPair mh,         // mate's ref coords
	               bool mfw,           // mate's orientation
	               uint16_t mlen,      // mate length
	               UPair a,          // arrow pair
	               uint32_t tlen,      // length of text
	               uint32_t qlen,      // length of query
	               int stratum,        // alignment stratum
	               uint16_t cost,      // cost of alignment
	               uint32_t oms,       // approx. # other valid alignments
	               uint32_t patid,
	               uint32_t seed,
	               uint8_t mate) const
	{
#ifndef NDEBUG
		// Check that no two elements of the mms array are the same
		for(size_t i = 0; i < numMms; i++) {
			for(size_t j = i+1; j < numMms; j++) {
				assert_neq(mmui32[i], mmui32[j]);
			}
		}
#endif
		// If ebwtFw is true, then 'query' and 'quals' are reversed
		// If _fw is false, then 'query' and 'quals' are reverse complemented
		assert(!color || ref != NULL);
		assert(quals != NULL);
		assert(name != NULL);
		assert_eq(mmui32.size(), refcs.size());
		assert_leq(numMms, mmui32.size());
		assert_gt(qlen, 0);
		Hit hit;
		hit.stratum = stratum;
		hit.cost = cost;
		hit.patSeq = query;
		hit.quals = *quals;
		if(!ebwtFw) {
			// Re-reverse the pattern and the quals back to how they
			// appeared in the read file
			::reverseInPlace(hit.patSeq);
			::reverseInPlace(hit.quals);
		}
		if(color) {
			hit.colSeq = hit.patSeq;
			hit.colQuals = hit.quals;
			hit.crefcs.resize(qlen, 0);
			// Turn the mmui32 and refcs arrays into the mm FixedBitset and
			// the refc vector
			for(size_t i = 0; i < numMms; i++) {
				if (ebwtFw != _fw) {
					// The 3' end is on the left but the mm vector encodes
					// mismatches w/r/t the 5' end, so we flip
					uint32_t off = qlen - mmui32[i] - 1;
					hit.cmms.set(off);
					hit.crefcs[off] = refcs[i];
				} else {
					hit.cmms.set(mmui32[i]);
					hit.crefcs[i] = refcs[i];
				}
			}
			assert(ref != NULL);
			char read[1024];
			uint32_t rfbuf[(1024+16)/4];
			ASSERT_ONLY(char rfbuf2[1024]);
			char qual[1024];
			char ns[1024];
			char cmm[1024];
			char nmm[1024];
			int cmms = 0;
			int nmms = 0;
			// TODO: account for indels when calculating these bounds
			size_t readi = 0;
			size_t readf = seqan::length(hit.patSeq);
			size_t refi = 0;
			size_t reff = readf + 1;
			bool maqRound = false;
			for(size_t i = 0; i < qlen + 1; i++) {
				if(i < qlen) {
					read[i] = (int)hit.patSeq[i];
					qual[i] = mmPenalty(maqRound, phredCharToPhredQual(hit.quals[i]));
				}
				ASSERT_ONLY(rfbuf2[i] = ref->getBase(h.first, h.second + i));
			}
			int offset = ref->getStretch(rfbuf, h.first, h.second, qlen + 1);
			char *rf = (char*)rfbuf + offset;
			for(size_t i = 0; i < qlen + 1; i++) {
				assert_eq(rf[i], rfbuf2[i]);
				rf[i] = (1 << rf[i]);
			}
			decodeHit(
				read,  // ASCII colors, '0', '1', '2', '3', '.'
				qual,  // ASCII quals, Phred+33 encoded
				readi, // offset of first character within 'read' to consider
				readf, // offset of last char (exclusive) in 'read' to consider
				rf,    // reference sequence, as masks
				refi, // offset of first character within 'ref' to consider
				reff, // offset of last char (exclusive) in 'ref' to consider
				snpPhred, // penalty incurred by a SNP
				ns,  // decoded nucleotides are appended here
				cmm, // where the color mismatches are in the string
				nmm, // where nucleotide mismatches are in the string
				cmms, // number of color mismatches
				nmms);// number of nucleotide mismatches
			size_t nqlen = qlen + (colExEnds ? -1 : 1);
			seqan::resize(hit.patSeq, nqlen);
			seqan::resize(hit.quals, nqlen);
			hit.refcs.resize(nqlen);
			size_t lo = colExEnds ? 1 : 0;
			size_t hi = colExEnds ? qlen : qlen+1;
			size_t destpos = 0;
			for(size_t i = lo; i < hi; i++, destpos++) {
				// Set sequence character
				assert_leq(ns[i], 4);
				assert_geq(ns[i], 0);
				hit.patSeq[destpos] = (Dna5)(int)ns[i];
				// Set initial quality
				hit.quals[destpos] = '!';
				// Color mismatches penalize quality
				if(i > 0) {
					if(cmm[i-1] == 'M') {
						if((int)hit.quals[destpos] + (int)qual[i-1] > 126) {
							hit.quals[destpos] = 126;
						} else {
							hit.quals[destpos] += qual[i-1];
						}
					} else if((int)hit.colSeq[i-1] != 4) {
						hit.quals[destpos] -= qual[i-1];
					}
				}
				if(i < qlen) {
					if(cmm[i] == 'M') {
						if((int)hit.quals[destpos] + (int)qual[i] > 126) {
							hit.quals[destpos] = 126;
						} else {
							hit.quals[destpos] += qual[i];
						}
					} else if((int)hit.patSeq[i] != 4) {
						hit.quals[destpos] -= qual[i];
					}
				}
				if(hit.quals[destpos] < '!') {
					hit.quals[destpos] = '!';
				}
				if(nmm[i] != 'M') {
					uint32_t off = (uint32_t)i - (colExEnds? 1:0);
					if(!_fw) off = (uint32_t)nqlen - off - 1;
					assert_lt(off, nqlen);
					hit.mms.set(off);
					hit.refcs[off] = "ACGT"[ref->getBase(h.first, h.second+i)];
				}
			}
			if(colExEnds) {
				// Extreme bases have been removed; that makes the
				// nucleotide alignment one character shorter than the
				// color alignment
				qlen--; mlen--;
				// It also shifts the alignment's offset up by 1
				h.second++;
			} else {
				// Extreme bases are included; that makes the
				// nucleotide alignment one character longer than the
				// color alignment
				qlen++; mlen++;
			}
		} else {
			// Turn the mmui32 and refcs arrays into the mm FixedBitset and
			// the refc vector
			hit.refcs.resize(qlen, 0);
			for(size_t i = 0; i < numMms; i++) {
				if (ebwtFw != _fw) {
					// The 3' end is on the left but the mm vector encodes
					// mismatches w/r/t the 5' end, so we flip
					uint32_t off = qlen - mmui32[i] - 1;
					hit.mms.set(off);
					hit.refcs[off] = refcs[i];
				} else {
					hit.mms.set(mmui32[i]);
					hit.refcs[mmui32[i]] = refcs[i];
				}
			}
		}
		// Check the hit against the original text, if it's available
		if(_texts.size() > 0) {
			assert_lt(h.first, _texts.size());
			FixedBitset<1024> diffs;
			// This type of check assumes that only mismatches are
			// possible.  If indels are possible, then we either need
			// the caller to provide information about indel locations,
			// or we need to extend this to a more complicated check.
			assert_leq(h.second + qlen, length(_texts[h.first]));
			for(size_t i = 0; i < qlen; i++) {
				assert_neq(4, (int)_texts[h.first][h.second + i]);
				// Forward pattern appears at h
				if((int)hit.patSeq[i] != (int)_texts[h.first][h.second + i]) {
					uint32_t qoff = (uint32_t)i;
					// if ebwtFw != _fw the 3' end is on on the
					// left end of the pattern, but the diff vector
					// should encode mismatches w/r/t the 5' end,
					// so we flip
					if (_fw) diffs.set(qoff);
					else     diffs.set(qlen - qoff - 1);
				}
			}
			if(diffs != hit.mms) {
				// Oops, mismatches were not where we expected them;
				// print a diagnostic message before asserting
				cerr << "Expected " << hit.mms.str() << " mismatches, got " << diffs.str() << endl;
				cerr << "  Pat:  " << hit.patSeq << endl;
				cerr << "  Tseg: ";
				for(size_t i = 0; i < qlen; i++) {
					cerr << _texts[h.first][h.second + i];
				}
				cerr << endl;
				cerr << "  mmui32: ";
				for(size_t i = 0; i < numMms; i++) {
					cerr << mmui32[i] << " ";
				}
				cerr << endl;
				cerr << "  FW: " << _fw << endl;
				cerr << "  Ebwt FW: " << ebwtFw << endl;
			}
			if(diffs != hit.mms) assert(false);
		}
		hit.h = h;
		hit.patId = ((patid == 0xffffffff) ? _patid : patid);
		hit.patName = *name;
		hit.mh = mh;
		hit.fw = _fw;
		hit.mfw = mfw;
		hit.mlen = mlen;
		hit.oms = oms;
		hit.mate = mate;
		hit.color = color;
		hit.primer = primer;
		hit.trimc = trimc;
		hit.seed = seed;
		assert(hit.repOk());
		return sink().reportHit(hit, stratum);
	}
private:
	HitSinkPerThread& _sink;
	const vector<String<Dna5> >& _texts; // original texts, if available (if not
	                            // available, _texts.size() == 0)
	uint32_t _patid;      // id of current read
	bool _fw;             // current read is forward-oriented
};

/**
 * Encapsulates a location in the bwt text in terms of the side it
 * occurs in and its offset within the side.
 */
struct SideLocus {
	SideLocus() :
	_sideByteOff(0),
	_sideNum(0),
	_charOff(0),
	_fw(true),
	_by(-1),
	_bp(-1) { }

	/**
	 * Construct from row and other relevant information about the Ebwt.
	 */
	SideLocus(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
		initFromRow(row, ep, ebwt);
	}

	/**
	 * Init two SideLocus objects from a top/bot pair, using the result
	 * from one call to initFromRow to possibly avoid a second call.
	 */
	static void initFromTopBot(TIndexOffU top,
	                           TIndexOffU bot,
	                           const EbwtParams& ep,
	                           const uint8_t* ebwt,
	                           SideLocus& ltop,
	                           SideLocus& lbot)
	{
		const TIndexOffU sideBwtLen = ep._sideBwtLen;
		const uint32_t sideBwtSz  = ep._sideBwtSz;
		assert_gt(bot, top);
		ltop.initFromRow(top, ep, ebwt);
		TIndexOffU spread = bot - top;
		if(ltop._charOff + spread < sideBwtLen) {
			lbot._charOff = (uint32_t)(ltop._charOff + spread);
			lbot._sideNum = ltop._sideNum;
			lbot._sideByteOff = ltop._sideByteOff;
			lbot._fw = ep._isBt2Index ? true : ltop._fw;
			lbot._by = lbot._charOff >> 2;
			assert_lt(lbot._by, (int)sideBwtSz);
			if(!lbot._fw) lbot._by = sideBwtSz - lbot._by - 1;
			lbot._bp = lbot._charOff & 3;
			if(!lbot._fw) lbot._bp ^= 3;
		} else {
			lbot.initFromRow(bot, ep, ebwt);
		}
	}

	/**
	 * Calculate SideLocus based on a row and other relevant
	 * information about the shape of the Ebwt.
	 */
	void initFromRow(TIndexOffU row, const EbwtParams& ep, const uint8_t* ebwt) {
		const uint32_t sideSz     = ep._sideSz;
		// Side length is hard-coded for now; this allows the compiler
		// to do clever things to accelerate / and %.
		if (ep._isBt2Index) {
			_sideNum = row / (48*OFF_SIZE);
			_charOff = row % (48*OFF_SIZE);
		} else {
			_sideNum = row / (56*OFF_SIZE);
			_charOff = row % (56*OFF_SIZE);
		}
		_sideByteOff = _sideNum * sideSz;
		assert_leq(row, ep._len);
		assert_leq(_sideByteOff + sideSz, ep._ebwtTotSz);
#ifndef NO_PREFETCH
		__builtin_prefetch((const void *)(ebwt + _sideByteOff),
		                   0 /* prepare for read */,
		                   PREFETCH_LOCALITY);
#endif
		// prefetch this side too
		_fw = ep._isBt2Index ? true : ((_sideNum & 1) != 0); // odd-numbered sides are forward
		_by = _charOff >> 2; // byte within side
		assert_lt(_by, (int)ep._sideBwtSz);
		_bp = _charOff & 3;  // bit-pair within byte
		if(!_fw) {
			_by = ep._sideBwtSz - _by - 1;
			_bp ^= 3;
		}
	}

	/// Return true iff this is an initialized SideLocus
	bool valid() {
		return _bp != -1;
	}

	/// Make this look like an invalid SideLocus
	void invalidate() {
		_bp = -1;
	}

	const uint8_t *side(const uint8_t* ebwt) const {
		return ebwt + _sideByteOff;
	}

	const uint8_t *oside(const uint8_t* ebwt) const {
		return ebwt + _sideByteOff + (_fw? (-128) : (128));
	}

	TIndexOffU _sideByteOff; // offset of top side within ebwt[]
	TIndexOffU _sideNum;     // index of side
	uint16_t _charOff;     // character offset within side
	bool _fw;              // side is forward or backward?
	int16_t _by;           // byte within side (not adjusted for bw sides)
	int8_t _bp;            // bitpair within byte (not adjusted for bw sides)
};

#include "ebwt_search_backtrack.h"

///////////////////////////////////////////////////////////////////////
//
// Functions for printing and sanity-checking Ebwts
//
///////////////////////////////////////////////////////////////////////

/**
 * Given a range of positions in the EBWT array within the BWT portion
 * of a forward side, print the characters at those positions along
 * with a summary occ[] array.
 */
template<typename TStr>
void Ebwt<TStr>::printRangeFw(uint32_t begin, uint32_t end) const {
	assert(isInMemory());
	uint32_t occ[] = {0, 0, 0, 0};
	assert_gt(end, begin);
	for(uint32_t i = begin; i < end; i++) {
		uint8_t by = this->_ebwt[i];
		for(int j = 0; j < 4; j++) {
			// Unpack from lowest to highest bit pair
			int twoBit = unpack_2b_from_8b(by, j);
			occ[twoBit]++;
			cout << "ACGT"[twoBit];
		}
		assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
	}
	cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
}

/**
 * Given a range of positions in the EBWT array within the BWT portion
 * of a backward side, print the characters at those positions along
 * with a summary occ[] array.
 */
template<typename TStr> /* check - update */
void Ebwt<TStr>::printRangeBw(uint32_t begin, uint32_t end) const {
	assert(isInMemory());
	uint32_t occ[] = {0, 0, 0, 0};
	assert_gt(end, begin);
	for(uint32_t i = end-1; i >= begin; i--) {
		uint8_t by = this->_ebwt[i];
		for(int j = 3; j >= 0; j--) {
			// Unpack from lowest to highest bit pair
			int twoBit = unpack_2b_from_8b(by, j);
			occ[twoBit]++;
			cout << "ACGT"[twoBit];
		}
		assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
		if(i == 0) break;
	}
	cout << ":{" << occ[0] << "," << occ[1] << "," << occ[2] << "," << occ[3] << "}" << endl;
}

/**
 * Check that the ebwt array is internally consistent up to (and not
 * including) the given side index by re-counting the chars and
 * comparing against the embedded occ[] arrays.
 */
template<typename TStr>
void Ebwt<TStr>::sanityCheckUpToSide(TIndexOff upToSide) const {
	assert(isInMemory());
	TIndexOffU occ[] = {0, 0, 0, 0};
	ASSERT_ONLY(TIndexOffU occ_save[] = {0, 0});
	TIndexOffU cur = 0; // byte pointer
	const EbwtParams& eh = this->_eh;
	bool fw = false;
	while(cur < (TIndexOffU)(upToSide * eh._sideSz)) {
		assert_leq(cur + eh._sideSz, eh._ebwtTotLen);
		for(uint32_t i = 0; i < eh._sideBwtSz; i++) {
			uint8_t by = this->_ebwt[cur + (fw ? i : eh._sideBwtSz-i-1)];
			for(int j = 0; j < 4; j++) {
				// Unpack from lowest to highest bit pair
				int twoBit = unpack_2b_from_8b(by, fw ? j : 3-j);
				occ[twoBit]++;
				//if(_verbose) cout << "ACGT"[twoBit];
			}
			assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % 4);
		}
		assert_eq(0, (occ[0] + occ[1] + occ[2] + occ[3]) % eh._sideBwtLen);
		if(fw) {
			// Finished forward bucket; check saved [G] and [T]
			// against the two uint32_ts encoded here
			ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast<TIndexOffU*>(&this->_ebwt[cur + eh._sideBwtSz]));
			ASSERT_ONLY(TIndexOffU gs = u32ebwt[0]);
			ASSERT_ONLY(TIndexOffU ts = u32ebwt[1]);
			assert_eq(gs, occ_save[0]);
			assert_eq(ts, occ_save[1]);
			fw = false;
		} else {
			// Finished backward bucket; check current [A] and [C]
			// against the two uint32_ts encoded here
			ASSERT_ONLY(TIndexOffU *u32ebwt = reinterpret_cast<TIndexOffU*>(&this->_ebwt[cur + eh._sideBwtSz]));
			ASSERT_ONLY(TIndexOffU as = u32ebwt[0]);
			ASSERT_ONLY(TIndexOffU cs = u32ebwt[1]);
			assert(as == occ[0] || as == occ[0]-1); // one 'a' is a skipped '$' and doesn't count toward occ[]
			assert_eq(cs, occ[1]);
			ASSERT_ONLY(occ_save[0] = occ[2]); // save gs
			ASSERT_ONLY(occ_save[1] = occ[3]); // save ts
			fw = true;
		}
		cur += eh._sideSz;
	}
}

/**
 * Sanity-check various pieces of the Ebwt
 */
template<typename TStr>
void Ebwt<TStr>::sanityCheckAll(int reverse) const {
	const EbwtParams& eh = this->_eh;
	assert(isInMemory());
	// Check ftab
	for(TIndexOffU i = 1; i < eh._ftabLen; i++) {
		assert_geq(this->ftabHi(i), this->ftabLo(i-1));
		assert_geq(this->ftabLo(i), this->ftabHi(i-1));
		assert_leq(this->ftabHi(i), eh._bwtLen+1);
	}
	assert_eq(this->ftabHi(eh._ftabLen-1), eh._bwtLen);

	// Check offs
	TIndexOff seenLen = (eh._bwtLen + 31) >> ((TIndexOffU)5);
	TIndexOff *seen;
	try {
		seen = new TIndexOff[seenLen]; // bitvector marking seen offsets
	} catch(bad_alloc& e) {
		cerr << "Out of memory allocating seen[] at " << __FILE__ << ":" << __LINE__ << endl;
		throw e;
	}
	memset(seen, 0, OFF_SIZE * seenLen);
	TIndexOffU offsLen = eh._offsLen;
	for(TIndexOffU i = 0; i < offsLen; i++) {
		assert_lt(this->_offs[i], eh._bwtLen);
		TIndexOff w = this->_offs[i] >> 5;
		TIndexOff r = this->_offs[i] & 31;
		assert_eq(0, (seen[w] >> r) & 1); // shouldn't have been seen before
		seen[w] |= (1 << r);
	}
	delete[] seen;

	// Check nPat
	assert_gt(this->_nPat, 0);

	// Check plen, flen
	for(TIndexOffU i = 0; i < this->_nPat; i++) {
		assert_geq(this->_plen[i], 0);
	}

	// Check rstarts
	for(TIndexOffU i = 0; i < this->_nFrag-1; i++) {
		assert_gt(this->_rstarts[(i+1)*3], this->_rstarts[i*3]);
		if(reverse == REF_READ_REVERSE) {
			assert(this->_rstarts[(i*3)+1] >= this->_rstarts[((i+1)*3)+1]);
		} else {
			assert(this->_rstarts[(i*3)+1] <= this->_rstarts[((i+1)*3)+1]);
		}
	}

	// Check ebwt
	sanityCheckUpToSide(eh._numSides);
	VMSG_NL("Ebwt::sanityCheck passed");
}

///////////////////////////////////////////////////////////////////////
//
// Functions for searching Ebwts
//
///////////////////////////////////////////////////////////////////////

/**
 * Return the final character in row i (i.e. the i'th character in the
 * BWT transform).  Note that the 'L' in the name of the function
 * stands for 'last', as in the literature.
 */
template<typename TStr>
inline int Ebwt<TStr>::rowL(const SideLocus& l) const {
	// Extract and return appropriate bit-pair
#ifdef SIXTY4_FORMAT
	return (((uint64_t*)l.side(this->_ebwt))[l._by >> 3] >> ((((l._by & 7) << 2) + l._bp) << 1)) & 3;
#else
	return unpack_2b_from_8b(l.side(this->_ebwt)[l._by], l._bp);
#endif
}

/**
 * Inline-function version of the above.  This does not always seem to
 * be inlined
 */
#if 0
// Use gcc's intrinsic popcountll.  I don't recommend it because it
// seems to be somewhat slower than the bit-bashing pop64 routine both
// on an AMD server and on an Intel workstation.  On the other hand,
// perhaps when the builtin is used GCC is smart enough to insert a
// pop-count instruction on architectures that have one (e.g. Itanium).
// For now, it's disabled.
#define pop64(x) __builtin_popcountll(x)
#elif 0
__declspec naked int __stdcall pop64
(uint64_t v)
{
static const uint64_t C55 = 0x5555555555555555ll;
static const uint64_t C33 = 0x3333333333333333ll;
static const uint64_t C0F = 0x0F0F0F0F0F0F0F0Fll;
__asm {
   MOVD      MM0, [ESP+4] ;v_low
   PUNPCKLDQ MM0, [ESP+8] ;v
   MOVQ      MM1, MM0     ;v
   PSRLD     MM0, 1       ;v >> 1
   PAND      MM0, [C55]   ;(v >> 1) & 0x55555555
   PSUBD     MM1, MM0     ;w = v - ((v >> 1) & 0x55555555)
   MOVQ      MM0, MM1     ;w
   PSRLD     MM1, 2       ;w >> 2
   PAND      MM0, [C33]   ;w & 0x33333333
   PAND      MM1, [C33]   ;(w >> 2)  & 0x33333333
   PADDD     MM0, MM1     ;x = (w & 0x33333333) +
                          ; ((w >> 2) & 0x33333333)
   MOVQ      MM1, MM0     ;x
   PSRLD     MM0, 4       ;x >> 4
   PADDD     MM0, MM1     ;x + (x >> 4)
   PAND      MM0, [C0F]   ;y = (x + (x >> 4) & 0x0F0F0F0F)
   PXOR      MM1, MM1     ;0
   PSADBW    (MM0, MM1)   ;sum across all 8 bytes
   MOVD      EAX, MM0     ;result in EAX per calling
                          ; convention
   EMMS                   ;clear MMX state
   RET  8                 ;pop 8-byte argument off stack
                          ; and return
   }
}
#elif 0
// Use a bytewise LUT version of popcount.  This is slower than the
// bit-bashing pop64 routine both on an AMD server and on an Intel
// workstation.  It seems to be about the same speed as the GCC builtin
// on Intel, and a bit faster than it on AMD.  For now, it's disabled.
const int popcntU8Table[256] = {
    0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
    3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
};

// Use this bytewise population count table
inline static int pop64(uint64_t x) {
	const unsigned char * p = (const unsigned char *) &x;
	return popcntU8Table[p[0]] +
	       popcntU8Table[p[1]] +
	       popcntU8Table[p[2]] +
	       popcntU8Table[p[3]] +
	       popcntU8Table[p[4]] +
	       popcntU8Table[p[5]] +
	       popcntU8Table[p[6]] +
	       popcntU8Table[p[7]];
}
#else
#ifdef POPCNT_CAPABILITY   // wrapping of "struct"
struct USE_POPCNT_GENERIC {
#endif
// Use this standard bit-bashing population count
inline static int pop64(uint64_t x) {
   x = x - ((x >> 1) & 0x5555555555555555llu);
   x = (x & 0x3333333333333333llu) + ((x >> 2) & 0x3333333333333333llu);
   x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0Fllu;
   x = x + (x >> 8);
   x = x + (x >> 16);
   x = x + (x >> 32);
   return x & 0x3F;
}
#ifdef POPCNT_CAPABILITY  // wrapping a "struct"
};
#endif

#ifdef POPCNT_CAPABILITY
    struct USE_POPCNT_INSTRUCTION {
        inline static int pop64(uint64_t x) {
            int64_t count;
            asm ("popcntq %[x],%[count]\n": [count] "=&r" (count): [x] "r" (x));
            return count;
        }
    };
#endif

#endif

/**
 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
 * within a 64-bit argument.
 */
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static int countInU64(int c, uint64_t dw) {
	uint64_t c0 = c_table[c];
	uint64_t x0 = dw ^ c0;
	uint64_t x1 = (x0 >> 1);
	uint64_t x2 = x1 & (0x5555555555555555llu);
	uint64_t x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
    uint64_t tmp = Operation().pop64(x3);
#else
    uint64_t tmp = pop64(x3);
#endif
	return (int) tmp;
}

/**
 * Tricky-bit-bashing bitpair counting for given two-bit value (0-3)
 * within a 64-bit argument.
 *
 * Function gets 2.32% in profile
 */
#ifdef POPCNT_CAPABILITY
template<typename Operation>
#endif
inline static void countInU64Ex(uint64_t dw, TIndexOffU* arrs) {
	uint64_t c0 = c_table[0];
	uint64_t x0 = dw ^ c0;
	uint64_t x1 = (x0 >> 1);
	uint64_t x2 = x1 & (0x5555555555555555llu);
	uint64_t x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
	uint64_t tmp = Operation().pop64(x3);
#else
	uint64_t tmp = pop64(x3);
#endif
	arrs[0] += (uint32_t) tmp;

        c0 = c_table[1];
        x0 = dw ^ c0;
        x1 = (x0 >> 1);
        x2 = x1 & (0x5555555555555555llu);
        x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
        tmp = Operation().pop64(x3);
#else
        tmp = pop64(x3);
#endif
        arrs[1] += (uint32_t) tmp;

        c0 = c_table[2];
        x0 = dw ^ c0;
        x1 = (x0 >> 1);
        x2 = x1 & (0x5555555555555555llu);
        x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
        tmp = Operation().pop64(x3);
#else
        tmp = pop64(x3);
#endif
        arrs[2] += (uint32_t) tmp;
	
        c0 = c_table[3];
        x0 = dw ^ c0;
        x1 = (x0 >> 1);
        x2 = x1 & (0x5555555555555555llu);
        x3 = x0 & x2;
#ifdef POPCNT_CAPABILITY
        tmp = Operation().pop64(x3);
#else
        tmp = pop64(x3);
#endif
        arrs[3] += (uint32_t) tmp;
}

/**
 * Counts the number of occurrences of character 'c' in the given Ebwt
 * side up to (but not including) the given byte/bitpair (by/bp).
 *
 * This is a performance-critical function.  This is the top search-
 * related hit in the time profile.
 *
 * Function gets 11.09% in profile
 */
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::countUpTo(const SideLocus& l, int c) const {
	// Count occurrences of c in each 64-bit (using bit trickery);
	// Someday countInU64() and pop() functions should be
	// vectorized/SSE-ized in case that helps.
	TIndexOffU cCnt = 0;
	const uint8_t *side = l.side(this->_ebwt);
	int i = 0;
#if 1
    #ifdef POPCNT_CAPABILITY
    if ( _usePOPCNTinstruction) {
        for(; i + 7 < l._by; i += 8) {
            cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, *(uint64_t*)&side[i]);
        }
    } 
    else {
        for(; i + 7 < l._by; i += 8) {
            cCnt += countInU64<USE_POPCNT_GENERIC>(c, *(uint64_t*)&side[i]);
        }
    }
    #else
    for(; i + 7 < l._by; i += 8) {
        cCnt += countInU64(c, *(uint64_t*)&side[i]);
    }
    #endif
#else
	for(; i + 2 < l._by; i += 2) {
		cCnt += cCntLUT_16b_4[c][*(uint16_t*)&side[i]];
	}
#endif
#ifdef SIXTY4_FORMAT
	// Calculate number of bit pairs to shift off the end
	const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
	if(bpShiftoff < 32) {
		assert_lt(bpShiftoff, 32);
		const uint64_t sw = (*(uint64_t*)&side[i]) << (bpShiftoff << 1);
        
        #ifdef POPCNT_CAPABILITY
        if (_usePOPCNTinstruction) {
            cCnt += countInU64<USE_POPCNT_INSTRUCTION>(c, sw);
        } 
        else {
            cCnt += countInU64<USE_POPCNT_GENERIC>(c, sw);
        }
        #else
		cCnt += countInU64(c, sw);
        #endif
        
		if(c == 0) cCnt -= bpShiftoff; // we turned these into As
	}
#else
	// Count occurences of c in the rest of the side (using LUT)
	for(; i < l._by; i++) {
		cCnt += cCntLUT_4[0][c][side[i]];
	}
	// Count occurences of c in the rest of the byte
	if(l._bp > 0) {
		cCnt += cCntLUT_4[(int)l._bp][c][side[i]];
	}
#endif
	return cCnt;
}

/**
 * Counts the number of occurrences of character 'c' in the given Ebwt
 * side up to (but not including) the given byte/bitpair (by/bp).
 */
template<typename TStr>
inline void Ebwt<TStr>::countUpToEx(const SideLocus& l, TIndexOffU* arrs) const {
	int i = 0;
	// Count occurrences of c in each 64-bit (using bit trickery);
	// note: this seems does not seem to lend a significant boost to
	// performance.  If you comment out this whole loop (which won't
	// affect correctness - it will just cause the following loop to
	// take up the slack) then runtime does not change noticeably.
	// Someday the countInU64() and pop() functions should be
	// vectorized/SSE-ized in case that helps.
	const uint8_t *side = l.side(this->_ebwt);

#ifdef POPCNT_CAPABILITY
    if (_usePOPCNTinstruction) {
        for(; i+7 < l._by; i += 8) {
            countInU64Ex<USE_POPCNT_INSTRUCTION>(*(uint64_t*)&side[i], arrs);
        }
    } 
    else {
        for(; i+7 < l._by; i += 8) {
            countInU64Ex<USE_POPCNT_GENERIC>(*(uint64_t*)&side[i], arrs);
        }
    }
#else 
	for(; i+7 < l._by; i += 8) {
		countInU64Ex(*(uint64_t*)&side[i], arrs);
	}
#endif

#ifdef SIXTY4_FORMAT
	// Calculate number of bit pairs to shift off the end
	const int bpShiftoff = 32 - (((l._by & 7) << 2) + l._bp);
	assert_leq(bpShiftoff, 32);
	if(bpShiftoff < 32) {
		const uint64_t sw = (*(uint64_t*)&l.side(this->_ebwt)[i]) << (bpShiftoff << 1);

#ifdef POPCNT_CAPABILITY
        if (_usePOPCNTinstruction) {
            countInU64Ex<USE_POPCNT_INSTRUCTION>(sw, arrs);
        }
        else{
            countInU64Ex<USE_POPCNT_GENERIC>(sw, arrs);
        }
#else       
		countInU64Ex(sw, arrs);
#endif

		arrs[0] -= bpShiftoff;
	}
#else
	// Count occurences of c in the rest of the side (using LUT)
	for(; i < l._by; i++) {
		arrs[0] += cCntLUT_4[0][0][side[i]];
		arrs[1] += cCntLUT_4[0][1][side[i]];
		arrs[2] += cCntLUT_4[0][2][side[i]];
		arrs[3] += cCntLUT_4[0][3][side[i]];
	}
	// Count occurences of c in the rest of the byte
	if(l._bp > 0) {
		arrs[0] += cCntLUT_4[(int)l._bp][0][side[i]];
		arrs[1] += cCntLUT_4[(int)l._bp][1][side[i]];
		arrs[2] += cCntLUT_4[(int)l._bp][2][side[i]];
		arrs[3] += cCntLUT_4[(int)l._bp][3][side[i]];
	}
#endif
}

/**
 * Count all occurrences of character c from the beginning of the
 * forward side to <by,bp> and add in the occ[] count up to the side
 * break just prior to the side.
 */
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::countFwSide(const SideLocus& l, int c) const { /* check */
	assert_lt(c, 4);
	assert_geq(c, 0);
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	const uint8_t *side = l.side(this->_ebwt);
	TIndexOffU cCnt = countUpTo(l, c);
	assert_leq(cCnt, this->_eh._sideBwtLen);
	if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
		{
			cCnt--; // Adjust for '$' looking like an 'A'
		}
	}
	TIndexOffU ret;
	// Now factor in the occ[] count at the side break
	if(c < 2) {
		const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side - 2*OFF_SIZE);
		assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
		assert_leq(ac[1], this->_eh._len);
		ret = ac[c] + cCnt + this->_fchr[c];
	} else {
		const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE); // next
		assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
		ret = gt[c-2] + cCnt + this->_fchr[c];
	}
#ifndef NDEBUG
	assert_leq(ret, this->_fchr[c+1]); // can't have jumpded into next char's section
	if(c == 0) {
		assert_leq(cCnt, this->_eh._sideBwtLen);
	} else {
		assert_leq(ret, this->_eh._bwtLen);
	}
#endif
	return ret;
}

/**
 * Count all occurrences of character c from the beginning of the
 * forward side to <by,bp> and add in the occ[] count up to the side
 * break just prior to the side.
 */
template<typename TStr>
inline void Ebwt<TStr>::countFwSideEx(const SideLocus& l, TIndexOffU* arrs) const
{
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	countUpToEx(l, arrs);
#ifndef NDEBUG
	assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
	assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
	assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
	assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
	assert_leq(arrs[0], this->_eh._sideBwtLen);
	assert_leq(arrs[1], this->_eh._sideBwtLen);
	assert_leq(arrs[2], this->_eh._sideBwtLen);
	assert_leq(arrs[3], this->_eh._sideBwtLen);
	const uint8_t *side = l.side(this->_ebwt);
	if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
		{
			arrs[0]--; // Adjust for '$' looking like an 'A'
		}
	}
	// Now factor in the occ[] count at the side break
	const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side - 2*OFF_SIZE);
	const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE);
#ifndef NDEBUG
	assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
	assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
	assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
	assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
#endif
	assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
	assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
	arrs[0] += (ac[0] + this->_fchr[0]);
	arrs[1] += (ac[1] + this->_fchr[1]);
	arrs[2] += (gt[0] + this->_fchr[2]);
	arrs[3] += (gt[1] + this->_fchr[3]);
#ifndef NDEBUG
	assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
	assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
	assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
	assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
}

/**
 * Count all instances of character c from <by,bp> to the logical end
 * (actual beginning) of the backward side, and subtract that from the
 * occ[] count up to the side break.
 */
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::countBwSide(const SideLocus& l, int c) const {
	assert_lt(c, 4);
	assert_geq(c, 0);
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	const uint8_t *side = l.side(this->_ebwt);
	TIndexOffU cCnt = countUpTo(l, c);
	if(rowL(l) == c) cCnt++;
	assert_leq(cCnt, this->_eh._sideBwtLen);
	if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
		{
			cCnt--;
		}
	}
	TIndexOffU ret;
	// Now factor in the occ[] count at the side break
	if(c < 2) {
		const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE);
		assert_leq(ac[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
		assert_leq(ac[1], this->_eh._len);
		ret = ac[c] - cCnt + this->_fchr[c];
	} else {
		const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + (2*this->_eh._sideSz) - 2*OFF_SIZE); // next
		assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
		ret = gt[c-2] - cCnt + this->_fchr[c];
	}
#ifndef NDEBUG
	assert_leq(ret, this->_fchr[c+1]); // can't have jumped into next char's section
	if(c == 0) {
		assert_leq(cCnt, this->_eh._sideBwtLen);
	} else {
		assert_lt(ret, this->_eh._bwtLen);
	}
#endif
	return ret;
}

/**
 * Count all instances of character c from <by,bp> to the logical end
 * (actual beginning) of the backward side, and subtract that from the
 * occ[] count up to the side break.
 */
template<typename TStr>
inline void Ebwt<TStr>::countBwSideEx(const SideLocus& l, TIndexOffU* arrs) const {
	assert_lt(l._by, (int)this->_eh._sideBwtSz);
	assert_geq(l._by, 0);
	assert_lt(l._bp, 4);
	assert_geq(l._bp, 0);
	const uint8_t *side = l.side(this->_ebwt);
	countUpToEx(l, arrs);
	arrs[rowL(l)]++;
	assert_leq(arrs[0], this->_eh._sideBwtLen);
	assert_leq(arrs[1], this->_eh._sideBwtLen);
	assert_leq(arrs[2], this->_eh._sideBwtLen);
	assert_leq(arrs[3], this->_eh._sideBwtLen);
	if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp >= _zEbwtBpOff))
		{
			arrs[0]--; // Adjust for '$' looking like an 'A'
		}
	}
	// Now factor in the occ[] count at the side break
	const TIndexOffU *ac = reinterpret_cast<const TIndexOffU*>(side + this->_eh._sideSz - 2*OFF_SIZE);
	const TIndexOffU *gt = reinterpret_cast<const TIndexOffU*>(side + (2*this->_eh._sideSz) - 2*OFF_SIZE);
#ifndef NDEBUG
	assert_leq(ac[0], this->_fchr[1] + this->_eh.sideBwtLen());
	assert_leq(ac[1], this->_fchr[2]-this->_fchr[1]);
	assert_leq(gt[0], this->_fchr[3]-this->_fchr[2]);
	assert_leq(gt[1], this->_fchr[4]-this->_fchr[3]);
#endif
	assert_leq(ac[0], this->_eh._len + this->_eh.sideBwtLen()); assert_leq(ac[1], this->_eh._len);
	assert_leq(gt[0], this->_eh._len); assert_leq(gt[1], this->_eh._len);
	arrs[0] = (ac[0] - arrs[0] + this->_fchr[0]);
	arrs[1] = (ac[1] - arrs[1] + this->_fchr[1]);
	arrs[2] = (gt[0] - arrs[2] + this->_fchr[2]);
	arrs[3] = (gt[1] - arrs[3] + this->_fchr[3]);
#ifndef NDEBUG
	assert_leq(arrs[0], this->_fchr[1]); // can't have jumped into next char's section
	assert_leq(arrs[1], this->_fchr[2]); // can't have jumped into next char's section
	assert_leq(arrs[2], this->_fchr[3]); // can't have jumped into next char's section
	assert_leq(arrs[3], this->_fchr[4]); // can't have jumped into next char's section
#endif
}

#define WITHIN_BWT_LEN(x) \
	assert_leq(x[0], this->_eh._sideBwtLen); \
	assert_leq(x[1], this->_eh._sideBwtLen); \
	assert_leq(x[2], this->_eh._sideBwtLen); \
	assert_leq(x[3], this->_eh._sideBwtLen)

#define WITHIN_FCHR(x) \
	assert_leq(x[0], this->fchr()[1]); \
	assert_leq(x[1], this->fchr()[2]); \
	assert_leq(x[2], this->fchr()[3]); \
	assert_leq(x[3], this->fchr()[4])

template<typename TStr>
inline TIndexOffU Ebwt<TStr>::countBt2Side(const SideLocus& l, int c) const {
	assert_range(0, 3, c);
	assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
	assert_range(0, 3, (int)l._bp);
	const uint8_t *side = l.side(this->ebwt());
	TIndexOffU cCnt = countUpTo(l, c);
	assert_leq(cCnt, this->_eh._sideBwtLen);
	if(c == 0 && l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
		{
			cCnt--; // Adjust for '$' looking like an 'A'
		}
	}
	TIndexOffU ret;
	// Now factor in the occ[] count at the side break
	const uint8_t *acgt8 = side + _eh._sideBwtSz;
	const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt8);
	assert_leq(acgt[0], this->_eh._numSides * this->_eh._sideBwtLen); // b/c it's used as padding
	assert_leq(acgt[1], this->_eh._len);
	assert_leq(acgt[2], this->_eh._len);
	assert_leq(acgt[3], this->_eh._len);
	ret = acgt[c] + cCnt + this->fchr()[c];
#ifndef NDEBUG
	assert_leq(ret, this->fchr()[c+1]); // can't have jumpded into next char's section
	if(c == 0) {
		assert_leq(cCnt, this->_eh._sideBwtLen);
	} else {
		assert_leq(ret, this->_eh._bwtLen);
	}
#endif
	return ret;
}

/**
 * Count all occurrences of character c from the beginning of the
 * forward side to <by,bp> and add in the occ[] count up to the side
 * break just prior to the side.
 *
 * A forward side is shaped like:
 *
 * [A] [C] XXXXXXXXXXXXXXXX
 * -4- -4- --------56------ (numbers in bytes)
 *         ^
 *         Side ptr (result from SideLocus.side())
 *
 * And following it is a reverse side shaped like:
 * 
 * [G] [T] XXXXXXXXXXXXXXXX
 * -4- -4- --------56------ (numbers in bytes)
 *         ^
 *         Side ptr (result from SideLocus.side())
 *
 */
template<typename TStr>
inline void Ebwt<TStr>::countBt2SideEx(const SideLocus& l, TIndexOffU* arrs) const {
	assert_range(0, (int)this->_eh._sideBwtSz-1, (int)l._by);
	assert_range(0, 3, (int)l._bp);
	countUpToEx(l, arrs);
	if(l._sideByteOff <= _zEbwtByteOff && l._sideByteOff + l._by >= _zEbwtByteOff) {
		// Adjust for the fact that we represented $ with an 'A', but
		// shouldn't count it as an 'A' here
		if((l._sideByteOff + l._by > _zEbwtByteOff) ||
		   (l._sideByteOff + l._by == _zEbwtByteOff && l._bp > _zEbwtBpOff))
		{
			arrs[0]--; // Adjust for '$' looking like an 'A'
		}
	}
	WITHIN_FCHR(arrs);
	WITHIN_BWT_LEN(arrs);
	// Now factor in the occ[] count at the side break
	const uint8_t *side = l.side(this->ebwt());
	const uint8_t *acgt16 = side + this->_eh._sideSz - OFF_SIZE*4;
	const TIndexOffU *acgt = reinterpret_cast<const TIndexOffU*>(acgt16);
	assert_leq(acgt[0], this->fchr()[1] + this->_eh.sideBwtLen());
	assert_leq(acgt[1], this->fchr()[2]-this->fchr()[1]);
	assert_leq(acgt[2], this->fchr()[3]-this->fchr()[2]);
	assert_leq(acgt[3], this->fchr()[4]-this->fchr()[3]);
	assert_leq(acgt[0], this->_eh._len + this->_eh.sideBwtLen());
	assert_leq(acgt[1], this->_eh._len);
	assert_leq(acgt[2], this->_eh._len);
	assert_leq(acgt[3], this->_eh._len);
	arrs[0] += (acgt[0] + this->fchr()[0]);
	arrs[1] += (acgt[1] + this->fchr()[1]);
	arrs[2] += (acgt[2] + this->fchr()[2]);
	arrs[3] += (acgt[3] + this->fchr()[3]);
	WITHIN_FCHR(arrs);
}

/**
 * Given top and bot loci, calculate counts of all four DNA chars up to
 * those loci.  Used for more advanced backtracking-search.
 */
template<typename TStr>
inline void Ebwt<TStr>::mapLFEx(const SideLocus& ltop,
                                const SideLocus& lbot,
                                TIndexOffU *tops,
                                TIndexOffU *bots
                                ASSERT_ONLY(, bool overrideSanity)
                                ) const
{
	// TODO: Where there's overlap, reuse the count for the overlapping
	// portion
#ifdef EBWT_STATS
	const_cast<Ebwt<TStr>*>(this)->mapLFExs_++;
#endif
	assert_eq(0, tops[0]); assert_eq(0, bots[0]);
	assert_eq(0, tops[1]); assert_eq(0, bots[1]);
	assert_eq(0, tops[2]); assert_eq(0, bots[2]);
	assert_eq(0, tops[3]); assert_eq(0, bots[3]);
	if(ltop._fw) {
		 // Forward side
		!_eh._isBt2Index ? countFwSideEx(ltop, tops)
		                 : countBt2SideEx(ltop, tops);
	} else {
		countBwSideEx(ltop, tops); // Backward side
	}

	if(lbot._fw) {
		// Forward side
		!_eh._isBt2Index ? countFwSideEx(lbot, bots)
		                 : countBt2SideEx(lbot, bots);
	} else {
		countBwSideEx(lbot, bots); // Backward side
	}
#ifndef NDEBUG
	if(_sanity && !overrideSanity) {
		// Make sure results match up with individual calls to mapLF;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		assert_eq(mapLF(ltop, 0, true), tops[0]);
		assert_eq(mapLF(ltop, 1, true), tops[1]);
		assert_eq(mapLF(ltop, 2, true), tops[2]);
		assert_eq(mapLF(ltop, 3, true), tops[3]);
		assert_eq(mapLF(lbot, 0, true), bots[0]);
		assert_eq(mapLF(lbot, 1, true), bots[1]);
		assert_eq(mapLF(lbot, 2, true), bots[2]);
		assert_eq(mapLF(lbot, 3, true), bots[3]);
	}
#endif
}

#ifndef NDEBUG
/**
 * Given top and bot loci, calculate counts of all four DNA chars up to
 * those loci.  Used for more advanced backtracking-search.
 */
template<typename TStr>
inline void Ebwt<TStr>::mapLFEx(const SideLocus& l,
		TIndexOffU *arrs
                                ASSERT_ONLY(, bool overrideSanity)
                                ) const
{
	assert_eq(0, arrs[0]);
	assert_eq(0, arrs[1]);
	assert_eq(0, arrs[2]);
	assert_eq(0, arrs[3]);
	if(l._fw) {
		// Forward side
		!_eh._isBt2Index ? countFwSideEx(l, arrs)
		                 : countBt2SideEx(l, arrs);
	} else {
		countBwSideEx(l, arrs); // Backward side
	}
#ifndef NDEBUG
	if(_sanity && !overrideSanity) {
		// Make sure results match up with individual calls to mapLF;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		assert_eq(mapLF(l, 0, true), arrs[0]);
		assert_eq(mapLF(l, 1, true), arrs[1]);
		assert_eq(mapLF(l, 2, true), arrs[2]);
		assert_eq(mapLF(l, 3, true), arrs[3]);
	}
#endif
}
#endif

/**
 * Given row i, return the row that the LF mapping maps i to.
 */
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::mapLF(const SideLocus& l
                                  ASSERT_ONLY(, bool overrideSanity)
                                  ) const
{
#ifdef EBWT_STATS
	const_cast<Ebwt<TStr>*>(this)->mapLFs_++;
#endif
	TIndexOffU ret;
	assert(l.side(this->_ebwt) != NULL);
	int c = rowL(l);
	assert_lt(c, 4);
	assert_geq(c, 0);
	if(l._fw) {
		// Forward side
		ret = !_eh._isBt2Index ? countFwSide(l, c)
		                       : countBt2Side(l, c);
	}
	else {
		ret = countBwSide(l, c); // Backward side
	}
	assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
	if(_sanity && !overrideSanity) {
		// Make sure results match up with results from mapLFEx;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		TIndexOffU arrs[] = { 0, 0, 0, 0 };
		mapLFEx(l, arrs, true);
		assert_eq(arrs[c], ret);
	}
#endif
	return ret;
}

/**
 * Given row i and character c, return the row that the LF mapping maps
 * i to on character c.
 */
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::mapLF(const SideLocus& l, int c
                                  ASSERT_ONLY(, bool overrideSanity)
                                  ) const
{
#ifdef EBWT_STATS
	const_cast<Ebwt<TStr>*>(this)->mapLFcs_++;
#endif
	TIndexOffU ret;
	assert_lt(c, 4);
	assert_geq(c, 0);
	if(l._fw) {
		// Forward side
		ret = !_eh._isBt2Index ? countFwSide(l, c)
		                       : countBt2Side(l, c);
	}
	else {
		ret = countBwSide(l, c); // Backward side
	}
	assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
	if(_sanity && !overrideSanity) {
		// Make sure results match up with results from mapLFEx;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		TIndexOffU arrs[] = { 0, 0, 0, 0 };
		mapLFEx(l, arrs, true);
		assert_eq(arrs[c], ret);
	}
#endif
	return ret;
}

/**
 * Given row i and character c, return the row that the LF mapping maps
 * i to on character c.
 */
template<typename TStr>
inline TIndexOffU Ebwt<TStr>::mapLF1(TIndexOffU row, const SideLocus& l, int c
                                   ASSERT_ONLY(, bool overrideSanity)
                                   ) const
{
#ifdef EBWT_STATS
	const_cast<Ebwt<TStr>*>(this)->mapLF1cs_++;
#endif
	if(rowL(l) != c || row == _zOff) return OFF_MASK;
	TIndexOffU ret;
	assert_lt(c, 4);
	assert_geq(c, 0);
	if(l._fw) {
		// Forward side
		ret = !_eh._isBt2Index ? countFwSide(l, c)
		                       : countBt2Side(l, c);
	} else {
		ret = countBwSide(l, c); // Backward side
	}
	assert_lt(ret, this->_eh._bwtLen);
#ifndef NDEBUG
	if(_sanity && !overrideSanity) {
		// Make sure results match up with results from mapLFEx;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		TIndexOffU arrs[] = { 0, 0, 0, 0 };
		mapLFEx(l, arrs, true);
		assert_eq(arrs[c], ret);
	}
#endif
	return ret;
}

/**
 * Given row i and character c, return the row that the LF mapping maps
 * i to on character c.
 */
template<typename TStr>
inline int Ebwt<TStr>::mapLF1(TIndexOffU& row, const SideLocus& l
                              ASSERT_ONLY(, bool overrideSanity)
                              ) const
{
#ifdef EBWT_STATS
	const_cast<Ebwt<TStr>*>(this)->mapLF1s_++;
#endif
	if(row == _zOff) return -1;
	int c = rowL(l);
	assert_lt(c, 4);
	assert_geq(c, 0);
	if(l._fw) {
		// Forward side
		row = !_eh._isBt2Index ? countFwSide(l, c)
		                       : countBt2Side(l, c);
	} else {
		row = countBwSide(l, c); // Backward side
	}
	assert_lt(row, this->_eh._bwtLen);
#ifndef NDEBUG
	if(_sanity && !overrideSanity) {
		// Make sure results match up with results from mapLFEx;
		// be sure to override sanity-checking in the callee, or we'll
		// have infinite recursion
		TIndexOffU arrs[] = { 0, 0, 0, 0 };
		mapLFEx(l, arrs, true);
		assert_eq(arrs[c], row);
	}
#endif
	return c;
}

/**
 * Take an offset into the joined text and translate it into the
 * reference of the index it falls on, the offset into the reference,
 * and the length of the reference.  Use a binary search through the
 * sorted list of reference fragment ranges t
 */
template<typename TStr>
void Ebwt<TStr>::joinedToTextOff(TIndexOffU qlen, TIndexOffU off,
								TIndexOffU& tidx,
								TIndexOffU& textoff,
								TIndexOffU& tlen) const
{
	TIndexOffU top = 0;
	TIndexOffU bot = _nFrag; // 1 greater than largest addressable element
	TIndexOffU elt = OFF_MASK;
	// Begin binary search
	while(true) {
		ASSERT_ONLY(TIndexOffU oldelt = elt);
		elt = top + ((bot - top) >> 1);
		assert_neq(oldelt, elt); // must have made progress
		TIndexOffU lower = _rstarts[elt*3];
		TIndexOffU upper;
		if(elt == _nFrag-1) {
			upper = _eh._len;
		} else {
			upper = _rstarts[((elt+1)*3)];
		}
		assert_gt(upper, lower);
		TIndexOffU fraglen = upper - lower;
		if(lower <= off) {
			if(upper > off) { // not last element, but it's within
				// off is in this range; check if it falls off
				if(off + qlen > upper) {
					// it falls off; signal no-go and return
					tidx = OFF_MASK;
					assert_lt(elt, _nFrag-1);
					return;
				}
				tidx = _rstarts[(elt*3)+1];
				assert_lt(tidx, this->_nPat);
				assert_leq(fraglen, this->_plen[tidx]);
				// it doesn't fall off; now calculate textoff.
				// Initially it's the number of characters that precede
				// the alignment in the fragment
				uint32_t fragoff = off - _rstarts[(elt*3)];
				if(!this->_fw) {
					fragoff = fraglen - fragoff - 1;
					fragoff -= (qlen-1);
				}
				// Add the alignment's offset into the fragment
				// ('fragoff') to the fragment's offset within the text
				textoff = fragoff + _rstarts[(elt*3)+2];
				assert_lt(textoff, this->_plen[tidx]);
				break; // done with binary search
			} else {
				// 'off' belongs somewhere in the region between elt
				// and bot
				top = elt;
			}
		} else {
			// 'off' belongs somewhere in the region between top and
			// elt
			bot = elt;
		}
		// continue with binary search
	}
	tlen = this->_plen[tidx];
}

/**
 * Report a potential match at offset 'off' with pattern length
 * 'qlen'.  Filter out spurious matches that span texts.
 */
template<typename TStr>
inline bool Ebwt<TStr>::report(const String<Dna5>& query,
                               String<char>* quals,
                               String<char>* name,
                               bool color,
                               char primer,
                               char trimc,
                               bool colExEnds,
                               int snpPhred,
                               const BitPairReference* ref,
                               const std::vector<TIndexOffU>& mmui32,
                               const std::vector<uint8_t>& refcs,
                               size_t numMms,
                               TIndexOffU off,
                               TIndexOffU top,
                               TIndexOffU bot,
                               uint32_t qlen,
                               int stratum,
                               uint16_t cost,
                               uint32_t patid,
                               uint32_t seed,
                               const EbwtSearchParams<TStr>& params) const
{
	VMSG_NL("In report");
	assert_geq(cost, (uint32_t)(stratum << 14));
	assert_lt(off, this->_eh._len);
	TIndexOffU tidx;
	TIndexOffU textoff;
	TIndexOffU tlen;
	joinedToTextOff(qlen, off, tidx, textoff, tlen);
	if(tidx == OFF_MASK) {
		return false;
	}
	return params.reportHit(
			query,                    // read sequence
			quals,                    // read quality values
			name,                     // read name
			color,                    // true -> read is colorspace
			primer,
			trimc,
			colExEnds,                // true -> exclude nucleotides on ends
			snpPhred,                 // phred probability of SNP
			ref,                      // reference sequence
			_fw,                      // true = index is forward; false = mirror
			mmui32,                   // mismatch positions
			refcs,                    // reference characters for mms
			numMms,                   // # mismatches
			make_pair(tidx, textoff), // position
			make_pair<TIndexOffU,TIndexOffU>(0, 0),          // (bogus) mate position
			true,                     // (bogus) mate orientation
			0,                        // (bogus) mate length
			make_pair(top, bot),      // arrows
			tlen,                     // textlen
			qlen,                     // qlen
			stratum,                  // alignment stratum
			cost,                     // cost, including stratum & quality penalty
			bot-top-1,                // # other hits
			patid,                    // pattern id
			seed,                     // pseudo-random seed
			0);                       // mate (0 = unpaired)
}

#include "row_chaser.h"

/**
 * Report a result.  Involves walking backwards along the original
 * string by way of the LF-mapping until we reach a marked SA row or
 * the row corresponding to the 0th suffix.  A marked row's offset
 * into the original string can be read directly from the this->_offs[]
 * array.
 */
template<typename TStr>
inline bool Ebwt<TStr>::reportChaseOne(const String<Dna5>& query,
                                       String<char>* quals,
                                       String<char>* name,
                                       bool color,
                                       char primer,
                                       char trimc,
                                       bool colExEnds,
                                       int snpPhred,
                                       const BitPairReference* ref,
                                       const std::vector<TIndexOffU>& mmui32,
                                       const std::vector<uint8_t>& refcs,
                                       size_t numMms,
                                       TIndexOffU i,
                                       TIndexOffU top,
                                       TIndexOffU bot,
                                       uint32_t qlen,
                                       int stratum,
                                       uint16_t cost,
                                       uint32_t patid,
                                       uint32_t seed,
                                       const EbwtSearchParams<TStr>& params,
                                       SideLocus *l) const
{
	VMSG_NL("In reportChaseOne");
	TIndexOffU off;
	uint32_t jumps = 0;
	ASSERT_ONLY(uint32_t origi = i);
	SideLocus myl;
	const TIndexOffU offMask = this->_eh._offMask;
	const uint32_t offRate = this->_eh._offRate;
	const TIndexOffU* offs = this->_offs;
	// If the caller didn't give us a pre-calculated (and prefetched)
	// locus, then we have to do that now
	if(l == NULL) {
		l = &myl;
		l->initFromRow(i, this->_eh, this->_ebwt);
	}
	assert(l != NULL);
	assert(l->valid());
	// Walk along until we reach the next marked row to the left
	while(((i & offMask) != i) && i != _zOff) {
		// Not a marked row; walk left one more char
		TIndexOffU newi = mapLF(*l); // calc next row
		assert_neq(newi, i);
		i = newi;                                  // update row
		l->initFromRow(i, this->_eh, this->_ebwt); // update locus
		jumps++;
	}
	// This is a marked row
	if(i == _zOff) {
		// Special case: it's the row corresponding to the
		// lexicographically smallest suffix, which is implicitly
		// marked 0
		off = jumps;
		VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
	} else {
		// Normal marked row, calculate offset of row i
		off = offs[i >> offRate] + jumps;
		VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
	}
#ifndef NDEBUG
	{
		uint32_t rcoff = RowChaser<TStr>::toFlatRefOff(this, qlen, origi);
		assert_eq(rcoff, off);
	}
#endif
	return report(query, quals, name, color, primer, trimc, colExEnds,
	              snpPhred, ref, mmui32, refcs, numMms, off, top, bot,
	              qlen, stratum, cost, patid, seed, params);
}

/**
 * Report a result.  Involves walking backwards along the original
 * string by way of the LF-mapping until we reach a marked SA row or
 * the row corresponding to the 0th suffix.  A marked row's offset
 * into the original string can be read directly from the this->_offs[]
 * array.
 */
template<typename TStr>
inline bool Ebwt<TStr>::reportReconstruct(const String<Dna5>& query,
                                          String<char>* quals,
                                          String<char>* name,
                                          String<Dna5>& lbuf,
                                          String<Dna5>& rbuf,
                                          const TIndexOffU *mmui32,
                                          const char* refcs,
                                          size_t numMms,
                                          TIndexOffU i,
                                          TIndexOffU top,
                                          TIndexOffU bot,
                                          uint32_t qlen,
                                          int stratum,
                                          const EbwtSearchParams<TStr>& params,
                                          SideLocus *l) const
{
	VMSG_NL("In reportReconstruct");
	assert_gt(_eh._isaLen, 0); // Must have inverse suffix array to reconstruct
	TIndexOffU off;
	TIndexOffU jumps = 0;
	SideLocus myl;
	const TIndexOffU offMask = this->_eh._offMask;
	const TIndexOffU offRate = this->_eh._offRate;
	const TIndexOffU* offs = this->_offs;
	const TIndexOffU* isa = this->_isa;
	assert(isa != NULL);
	if(l == NULL) {
		l = &myl;
		myl.initFromRow(i, this->_eh, this->_ebwt);
	}
	assert(l != NULL);
	clear(lbuf); clear(rbuf);
	// Walk along until we reach the next marked row to the left
	while(((i & offMask) != i) && i != _zOff) {
		// Not a marked row; walk left one more char
		int c = rowL(*l);
		appendValue(lbuf, (Dna5)c);
		TIndexOffU newi;
		assert_lt(c, 4);
		assert_geq(c, 0);
		if(l->_fw) {
			// Forward side
			newi = !_eh._isBt2Index ? countFwSide(*l, c)
			                        : countBt2Side(*l, c);
		} else {
			newi = countBwSide(*l, c); // Backward side
		}
		assert_lt(newi, this->_eh._bwtLen);
		assert_neq(newi, i);
		i = newi;                                  // update row
		l->initFromRow(i, this->_eh, this->_ebwt); // update locus
		jumps++;
	}
	// This is a marked row
	if(i == _zOff) {
		// Special case: it's the row corresponding to the
		// lexicographically smallest suffix, which is implicitly
		// marked 0
		off = jumps;
		VMSG_NL("reportChaseOne found zoff off=" << off << " (jumps=" << jumps << ")");
	} else {
		// Normal marked row, calculate offset of row i
		off = offs[i >> offRate] + jumps;
		VMSG_NL("reportChaseOne found off=" << off << " (jumps=" << jumps << ")");
	}
	// 'off' now holds the text offset of the first (leftmost) position
	// involved in the alignment.  Next we call joinedToTextOff to
	// check whether the seed is valid (i.e., does not straddle a
	// boundary between two reference seuqences) and to obtain its
	// extents
	TIndexOffU tidx;    // the index (id) of the reference we hit in
	TIndexOffU textoff; // the offset of the alignment within the reference
	TIndexOffU tlen;    // length of reference seed hit in
	joinedToTextOff(qlen, off, tidx, textoff, tlen);
	if(tidx == OFF_MASK) {
		// The seed straddled a reference boundary, and so is spurious.
		// Return false, indicating that we shouldn't stop.
		return false;
	}
	if(jumps > textoff) {
		// In our progress toward a marked row, we passed the boundary
		// between the reference sequence containing the seed and the
		// reference sequence to the left of it.  That's OK, we just
		// need to knock off the extra characters we added to 'lbuf'.
		assert_eq(jumps, length(lbuf));
		_setLength(lbuf, textoff);
		jumps = textoff;
		assert_eq(textoff, length(lbuf));
	} else if(jumps < textoff) {
		// Keep walking until we reach the end of the reference
		assert_neq(i, _zOff);
		TIndexOffU diff = textoff-jumps;
		for(size_t j = 0; j < diff; j++) {
			// Not a marked row; walk left one more char
			int c = rowL(*l);
			appendValue(lbuf, (Dna5)c);
			TIndexOffU newi;
			assert_lt(c, 4);
			assert_geq(c, 0);
			if(l->_fw) {
				// Forward side
				newi = !_eh._isBt2Index ? countFwSide(*l, c)
				                        : countBt2Side(*l, c);
			} else {
				newi = countBwSide(*l, c); // Backward side
			}
			assert_lt(newi, this->_eh._bwtLen);
			assert_neq(newi, i);
			i = newi;                                  // update row
			assert_neq(i, _zOff);
			l->initFromRow(i, this->_eh, this->_ebwt); // update locus
			jumps++;
		}
		assert_eq(textoff, jumps);
		assert_eq(textoff, length(lbuf));
	}
	assert_eq(textoff, jumps);
	assert_eq(textoff, length(lbuf));
	// Calculate the right-hand extent of the reference
	TIndexOffU ref_right = off - textoff + tlen;
	// Round the right-hand extent to the nearest ISA element that maps
	// to it or a character to its right
	TIndexOffU ref_right_rounded = ref_right;
	if((ref_right_rounded & _eh._isaMask) != ref_right_rounded) {
		ref_right_rounded = ((ref_right_rounded >> _eh._isaRate)+1) << _eh._isaRate;
	}
	// TODO: handle case where ref_right_rounded is off the end of _isa
	// Let the current suffix-array elt be determined by the ISA
	if((ref_right_rounded >> _eh._isaRate) >= _eh._isaLen) {
		i = _eh._len;
		ref_right_rounded = _eh._len;
	} else {
		i = isa[ref_right_rounded >> _eh._isaRate];
	}
	TIndexOffU right_steps_rounded = ref_right_rounded - (off + qlen);
	TIndexOffU right_steps = ref_right - (off + qlen);
	l->initFromRow(i, this->_eh, this->_ebwt); // update locus
	for(size_t j = 0; j < right_steps_rounded; j++) {
		// Not a marked row; walk left one more char
		int c = rowL(*l);
		appendValue(rbuf, (Dna5)c);
		TIndexOffU newi;
		assert_lt(c, 4); assert_geq(c, 0);
		if(l->_fw) {
			// Forward side
			newi = !_eh._isBt2Index ? countFwSide(*l, c)
			                        : countBt2Side(*l, c);
		}
		else {
			newi = countBwSide(*l, c); // Backward side
		}
		assert_lt(newi, this->_eh._bwtLen);
		assert_neq(newi, i);
		i = newi;                                  // update row
		assert_neq(i, _zOff);
		l->initFromRow(i, this->_eh, this->_ebwt); // update locus
		jumps++;
	}
	if(right_steps_rounded > right_steps) {
		jumps -= (right_steps_rounded - right_steps);
		_setLength(rbuf, right_steps);
	}
	assert_eq(right_steps, length(rbuf));
	assert_eq(tlen, jumps + qlen);
	::reverseInPlace(lbuf);
	::reverseInPlace(rbuf);
	{
		cout << "reportReconstruct:" << endl
			 << "  " << lbuf << query << rbuf << endl;
		cout << "  ";
		for(size_t i = 0; i < length(lbuf); i++) cout << " ";
		cout << query << endl;
	}
	// Now we've reconstructed the
	return false;
}

/**
 * Transform this Ebwt into the original string in linear time by using
 * the LF mapping to walk backwards starting at the row correpsonding
 * to the end of the string.  The result is written to s.  The Ebwt
 * must be in memory.
 */
template<typename TStr>
void Ebwt<TStr>::restore(TStr& s) const {
	assert(isInMemory());
	resize(s, this->_eh._len, Exact());
	TIndexOffU jumps = 0;
	TIndexOffU i = this->_eh._len; // should point to final SA elt (starting with '$')
	SideLocus l(i, this->_eh, this->_ebwt);
	while(i != _zOff) {
		assert_lt(jumps, this->_eh._len);
		//if(_verbose) cout << "restore: i: " << i << endl;
		// Not a marked row; go back a char in the original string
		TIndexOffU newi = mapLF(l);
		assert_neq(newi, i);
		s[this->_eh._len - jumps - 1] = rowL(l);
		i = newi;
		l.initFromRow(i, this->_eh, this->_ebwt);
		jumps++;
	}
	assert_eq(jumps, this->_eh._len);
}

/**
 * Check that this Ebwt, when restored via restore(), matches up with
 * the given array of reference sequences.  For sanity checking.
 */
template <typename TStr>
void Ebwt<TStr>::checkOrigs(const vector<String<Dna5> >& os,
                            bool color, bool mirror) const
{
	TStr rest;
	restore(rest);
	TIndexOffU restOff = 0;
	size_t i = 0, j = 0;
	if(mirror) {
		// TODO: FIXME
		return;
	}
	while(i < os.size()) {
		size_t olen = length(os[i]);
		int lastorig = -1;
		for(; j < olen; j++) {
			size_t joff = j;
			if(mirror) joff = olen - j - 1;
			if((int)os[i][joff] == 4) {
				// Skip over Ns
				lastorig = -1;
				if(!mirror) {
					while(j < olen && (int)os[i][j] == 4) j++;
				} else {
					while(j < olen && (int)os[i][olen-j-1] == 4) j++;
				}
				j--;
				continue;
			}
			if(lastorig == -1 && color) {
				lastorig = os[i][joff];
				continue;
			}
			if(color) {
				assert_neq(-1, lastorig);
				assert_eq(dinuc2color[(int)os[i][joff]][lastorig], rest[restOff]);
			} else {
				assert_eq(os[i][joff], rest[restOff]);
			}
			lastorig = (int)os[i][joff];
			restOff++;
		}
		if(j == length(os[i])) {
			// Moved to next sequence
			i++;
			j = 0;
		} else {
			// Just jumped over a gap
		}
	}
}

///////////////////////////////////////////////////////////////////////
//
// Functions for reading and writing Ebwts
//
///////////////////////////////////////////////////////////////////////

/**
 * Read an Ebwt from file with given filename.
 */
template<typename TStr>
void Ebwt<TStr>::readIntoMemory(
	int color,
	int needEntireRev,
	bool justHeader,
	EbwtParams *params,
	bool mmSweep,
	bool loadNames,
	bool startVerbose)
{
	bool switchEndian; // dummy; caller doesn't care
#ifdef BOWTIE_MM
	char *mmFile[] = { NULL, NULL };
#endif
	if(_in1Str.length() > 0) {
		if(_verbose || startVerbose) {
			cerr << "  About to open input files: ";
			logTime(cerr);
		}
		// Initialize our primary and secondary input-stream fields
		if(_in1 != NULL) fclose(_in1);
		if(_verbose || startVerbose) cerr << "Opening \"" << _in1Str << "\"" << endl;
		if((_in1 = fopen(_in1Str.c_str(), "rb")) == NULL) {
			cerr << "Could not open index file " << _in1Str << endl;
		}
		if(_in2 != NULL) fclose(_in2);
		if(_verbose || startVerbose) cerr << "Opening \"" << _in2Str << "\"" << endl;
		if((_in2 = fopen(_in2Str.c_str(), "rb")) == NULL) {
			cerr << "Could not open index file " << _in2Str << endl;
		}

		if(_verbose || startVerbose) {
			cerr << "  Finished opening input files: ";
			logTime(cerr);
		}

#ifdef BOWTIE_MM
		if(_useMm /*&& !justHeader*/) {
			const char *names[] = {_in1Str.c_str(), _in2Str.c_str()};
			int fds[] = { fileno(_in1), fileno(_in2) };
			for(int i = 0; i < 2; i++) {
				if(_verbose || startVerbose) {
					cerr << "  Memory-mapping input file " << (i+1) << ": ";
					logTime(cerr);
				}
				struct stat sbuf;
				if (stat(names[i], &sbuf) == -1) {
					perror("stat");
					cerr << "Error: Could not stat index file " << names[i] << " prior to memory-mapping" << endl;
					throw 1;
				}
				mmFile[i] = (char*)mmap((void *)0, sbuf.st_size,
										PROT_READ, MAP_SHARED, fds[i], 0);
				if(mmFile[i] == (void *)(-1)) {
					perror("mmap");
					cerr << "Error: Could not memory-map the index file " << names[i] << endl;
					throw 1;
				}
				if(mmSweep) {
					int sum = 0;
					for(off_t j = 0; j < sbuf.st_size; j += 1024) {
						sum += (int) mmFile[i][j];
					}
					if(startVerbose) {
						cerr << "  Swept the memory-mapped ebwt index file 1; checksum: " << sum << ": ";
						logTime(cerr);
					}
				}
			}
			mmFile1_ = mmFile[0];
			mmFile2_ = mmFile[1];
		}
#endif
	}
#ifdef BOWTIE_MM
	else if(_useMm && !justHeader) {
		mmFile[0] = mmFile1_;
		mmFile[1] = mmFile2_;
	}
	if(_useMm && !justHeader) {
		assert(mmFile[0] == mmFile1_);
		assert(mmFile[1] == mmFile2_);
	}
#endif

	if(_verbose || startVerbose) {
		cerr << "  Reading header: ";
		logTime(cerr);
	}

	// Read endianness hints from both streams
	uint64_t bytesRead = 0;
	switchEndian = false;
	uint32_t one = readU<uint32_t>(_in1, switchEndian); // 1st word of primary stream
	bytesRead += 4;
	#ifndef NDEBUG
	assert_eq(one, readU<uint32_t>(_in2, switchEndian)); // should match!
	#else
	readU<uint32_t>(_in2, switchEndian);
	#endif
	if(one != 1) {
		assert_eq((1u<<24), one);
		assert_eq(1, endianSwapU32(one));
		switchEndian = true;
	}

	// Can't switch endianness and use memory-mapped files; in order to
	// support this, someone has to modify the file to switch
	// endiannesses appropriately, and we can't do this inside Bowtie
	// or we might be setting up a race condition with other processes.
	if(switchEndian && _useMm) {
		cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
		throw 1;
	}

	// Reads header entries one by one from primary stream
	TIndexOffU len          = readU<TIndexOffU>(_in1, switchEndian);
	bytesRead += OFF_SIZE;
	int32_t  lineRate     = readI<int32_t>(_in1, switchEndian);
	bytesRead += 4;
	int32_t  linesPerSide = readI<int32_t>(_in1, switchEndian);
	bytesRead += 4;
	int32_t  offRate      = readI<int32_t>(_in1, switchEndian);
	bytesRead += 4;
	// TODO: add isaRate to the actual file format (right now, the
	// user has to tell us whether there's an ISA sample and what the
	// sampling rate is.
	int32_t  isaRate      = _overrideIsaRate;
	int32_t  ftabChars    = readI<int32_t>(_in1, switchEndian);
	bytesRead += 4;
	// chunkRate was deprecated in an earlier version of Bowtie; now
	// we use it to hold flags.
	int32_t flags = readI<int32_t>(_in1, switchEndian);
	bool entireRev = false;
	if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
		if(color != -1 && !color) {
			cerr << "Error: -C was not specified when running bowtie, but index is in colorspace.  If" << endl
			     << "your reads are in colorspace, please use the -C option.  If your reads are not" << endl
			     << "in colorspace, please use a normal index (one built without specifying -C to" << endl
			     << "bowtie-build)." << endl;
			throw 1;
		}
		color = 1;
	} else if(flags < 0) {
		if(color != -1 && color) {
			cerr << "Error: -C was specified when running bowtie, but index is not in colorspace.  If" << endl
			     << "your reads are in colorspace, please use a colorspace index (one built using" << endl
			     << "bowtie-build -C).  If your reads are not in colorspace, don't specify -C when" << endl
			     << "running bowtie." << endl;
			throw 1;
		}
		color = 0;
	}
	if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) == 0)) {
		if(needEntireRev != -1 && needEntireRev != 0) {
			cerr << "Error: This index is not compatible with this version of bowtie.  Please use a" << endl
			     << "current version of bowtie-build." << endl;
			throw 1;
		}
	} else entireRev = true;
	bytesRead += 4;

	// Create a new EbwtParams from the entries read from primary stream
	EbwtParams *eh;
	bool deleteEh = false;
	if(params != NULL) {
		params->init(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev, _isBt2Index);
		if(_verbose || startVerbose) params->print(cerr);
		eh = params;
	} else {
		eh = new EbwtParams(len, lineRate, linesPerSide, offRate, isaRate, ftabChars, color, entireRev, _isBt2Index);
		deleteEh = true;
	}

	// Set up overridden suffix-array-sample parameters
	TIndexOffU offsLen = eh->_offsLen;
	uint64_t offsSz = eh->_offsSz;
	TIndexOffU offRateDiff = 0;
	TIndexOffU offsLenSampled = offsLen;
	if(_overrideOffRate > offRate) {
		offRateDiff = _overrideOffRate - offRate;
	}
	if(offRateDiff > 0) {
		offsLenSampled >>= offRateDiff;
		if((offsLen & ~(OFF_MASK << offRateDiff)) != 0) {
			offsLenSampled++;
		}
	}

	// Set up overridden inverted-suffix-array-sample parameters
	TIndexOffU isaLen = eh->_isaLen;
	TIndexOffU isaRateDiff = 0;
	TIndexOffU isaLenSampled = isaLen;
	if(_overrideIsaRate > isaRate) {
		isaRateDiff = _overrideIsaRate - isaRate;
	}
	if(isaRateDiff > 0) {
		isaLenSampled >>= isaRateDiff;
		if((isaLen & ~(OFF_MASK << isaRateDiff)) != 0) {
			isaLenSampled++;
		}
	}

	// Can't override the offrate or isarate and use memory-mapped
	// files; ultimately, all processes need to copy the sparser sample
	// into their own memory spaces.
	if(_useMm && (offRateDiff || isaRateDiff)) {
		cerr << "Error: Can't use memory-mapped files when the offrate or isarate is overridden" << endl;
		throw 1;
	}

	// Read nPat from primary stream
	this->_nPat = readI<TIndexOffU>(_in1, switchEndian);
	bytesRead += OFF_SIZE;
	if(this->_plen != NULL && !_useMm) {
		// Delete it so that we can re-read it
		delete[] this->_plen;
		this->_plen = NULL;
	}

	// Read plen from primary stream
	if(_useMm) {
#ifdef BOWTIE_MM
		this->_plen = (TIndexOffU*)(mmFile[0] + bytesRead);
		bytesRead += this->_nPat*OFF_SIZE;
		fseeko(_in1, this->_nPat*OFF_SIZE, SEEK_CUR);
#endif
	} else {
		try {
			if(_verbose || startVerbose) {
				cerr << "Reading plen (" << this->_nPat << "): ";
				logTime(cerr);
			}
			this->_plen = new TIndexOffU[this->_nPat];
			if(switchEndian) {
				for(TIndexOffU i = 0; i < this->_nPat; i++) {
					this->_plen[i] = readU<TIndexOffU>(_in1, switchEndian);
				}
			} else {
				size_t r = MM_READ(_in1, (void*)this->_plen, this->_nPat*OFF_SIZE);
				if(r != (size_t)(this->_nPat*OFF_SIZE)) {
					cerr << "Error reading _plen[] array: " << r << ", " << (this->_nPat*OFF_SIZE) << endl;
					throw 1;
				}
			}
		} catch(bad_alloc& e) {
			cerr << "Out of memory allocating plen[] in Ebwt::read()"
				 << " at " << __FILE__ << ":" << __LINE__ << endl;
			throw e;
		}
	}

	bool shmemLeader;

	// TODO: I'm not consistent on what "header" means.  Here I'm using
	// "header" to mean everything that would exist in memory if we
	// started to build the Ebwt but stopped short of the build*() step
	// (i.e. everything up to and including join()).
	if(justHeader) goto done;

	this->_nFrag = readU<TIndexOffU>(_in1, switchEndian);
	bytesRead += OFF_SIZE;
	if(_verbose || startVerbose) {
		cerr << "Reading rstarts (" << this->_nFrag*3 << "): ";
		logTime(cerr);
	}
	assert_geq(this->_nFrag, this->_nPat);
	if(_useMm) {
#ifdef BOWTIE_MM
		this->_rstarts = (TIndexOffU*)(mmFile[0] + bytesRead);
		bytesRead += this->_nFrag*OFF_SIZE*3;
		fseeko(_in1, this->_nFrag*OFF_SIZE*3, SEEK_CUR);
#endif
	} else {
		this->_rstarts = new TIndexOffU[this->_nFrag*3];
		if(switchEndian) {
			for(TIndexOffU i = 0; i < this->_nFrag*3; i += 3) {
				// fragment starting position in joined reference
				// string, text id, and fragment offset within text
				this->_rstarts[i]   = readU<TIndexOffU>(_in1, switchEndian);
				this->_rstarts[i+1] = readU<TIndexOffU>(_in1, switchEndian);
				this->_rstarts[i+2] = readU<TIndexOffU>(_in1, switchEndian);
			}
		} else {
			size_t r = MM_READ(_in1, (void *)this->_rstarts, this->_nFrag*OFF_SIZE*3);
			if(r != (size_t)(this->_nFrag*OFF_SIZE*3)) {
				cerr << "Error reading _rstarts[] array: " << r << ", " << (this->_nFrag*OFF_SIZE*3) << endl;
				throw 1;
			}
		}
	}

	if(_useMm) {
#ifdef BOWTIE_MM
		this->_ebwt = (uint8_t*)(mmFile[0] + bytesRead);
		bytesRead += eh->_ebwtTotLen;
		fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR);
#endif
	} else {
		// Allocate ebwt (big allocation)
		if(_verbose || startVerbose) {
			cerr << "Reading ebwt (" << eh->_ebwtTotLen << "): ";
			logTime(cerr);
		}
		bool shmemLeader = true;
		if(useShmem_) {
			shmemLeader = ALLOC_SHARED_U8(
				(_in1Str + "[ebwt]"), eh->_ebwtTotLen, &this->_ebwt,
				"ebwt[]", (_verbose || startVerbose));
			if(_verbose || startVerbose) {
				cerr << "  shared-mem " << (shmemLeader ? "leader" : "follower") << endl;
			}
		} else {
			try {
				this->_ebwt = new uint8_t[eh->_ebwtTotLen];
			} catch(bad_alloc& e) {
				cerr << "Out of memory allocating the ebwt[] array for the Bowtie index.  Please try" << endl
				     << "again on a computer with more memory." << endl;
				throw 1;
			}
		}
		if(shmemLeader) {
			// Read ebwt from primary stream
			uint64_t bytesLeft = eh->_ebwtTotLen;
			char *pebwt = (char*)this->ebwt();

			while (bytesLeft>0){
				size_t r = MM_READ(_in1, (void *)pebwt, bytesLeft);
				if(MM_IS_IO_ERR(_in1,r,bytesLeft)) {
					cerr << "Error reading ebwt array: returned " << r << ", length was " << (eh->_ebwtTotLen) << endl
					     << "Your index files may be corrupt; please try re-building or re-downloading." << endl
					     << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl
					     << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt.  The XYZ.1.ebwt and " << endl
					     << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl
					     << "XYZ.rev.2.ebwt files." << endl;
					throw 1;
				}
				pebwt += r;
				bytesLeft -= r;
			}
			if(switchEndian) {
				uint8_t *side = this->_ebwt;
				for(size_t i = 0; i < eh->_numSides; i++) {
					TIndexOffU *cums = reinterpret_cast<TIndexOffU*>(side + eh->_sideSz - 2*OFF_SIZE);
					cums[0] = endianSwapU(cums[0]);
					cums[1] = endianSwapU(cums[1]);
					side += this->_eh._sideSz;
				}
			}
			if(useShmem_) NOTIFY_SHARED(this->_ebwt, eh->_ebwtTotLen);
		} else {
			// Seek past the data and wait until master is finished
			fseeko(_in1, eh->_ebwtTotLen, SEEK_CUR);
			if(useShmem_) WAIT_SHARED(this->_ebwt, eh->_ebwtTotLen);
		}
	}

	// Read zOff from primary stream
	_zOff = readU<TIndexOffU>(_in1, switchEndian);
	bytesRead += OFF_SIZE;
	assert_lt(_zOff, len);

	try {
		// Read fchr from primary stream
		if(_verbose || startVerbose) cerr << "Reading fchr (5)" << endl;
		if(_useMm) {
#ifdef BOWTIE_MM
			this->_fchr = (TIndexOffU*)(mmFile[0] + bytesRead);
			bytesRead += 5*OFF_SIZE;
			fseeko(_in1, 5*OFF_SIZE, SEEK_CUR);
#endif
		} else {
			this->_fchr = new TIndexOffU[5];
			for(int i = 0; i < 5; i++) {
				this->_fchr[i] = readU<TIndexOffU>(_in1, switchEndian);
				assert_leq(this->_fchr[i], len);
				if(i > 0) assert_geq(this->_fchr[i], this->_fchr[i-1]);
			}
		}
		assert_gt(this->_fchr[4], this->_fchr[0]);
		// Read ftab from primary stream
		if(_verbose || startVerbose) {
			cerr << "Reading ftab (" << eh->_ftabLen << "): ";
			logTime(cerr);
		}
		if(_useMm) {
#ifdef BOWTIE_MM
			this->_ftab = (TIndexOffU*)(mmFile[0] + bytesRead);
			bytesRead += eh->_ftabLen*OFF_SIZE;
			fseeko(_in1, eh->_ftabLen*OFF_SIZE, SEEK_CUR);
#endif
		} else {
			this->_ftab = new TIndexOffU[eh->_ftabLen];
			if(switchEndian) {
				for(TIndexOffU i = 0; i < eh->_ftabLen; i++)
					this->_ftab[i] = readU<TIndexOffU>(_in1, switchEndian);
			} else {
				size_t r = MM_READ(_in1, (void *)this->_ftab, eh->_ftabLen*OFF_SIZE);
				if(r != (size_t)(eh->_ftabLen*OFF_SIZE)) {
					cerr << "Error reading _ftab[] array: " << r << ", " << (eh->_ftabLen*OFF_SIZE) << endl;
					throw 1;
				}
			}
		}
		// Read etab from primary stream
		if(_verbose || startVerbose) {
			cerr << "Reading eftab (" << eh->_eftabLen << "): ";
			logTime(cerr);
		}
		if(_useMm) {
#ifdef BOWTIE_MM
			this->_eftab = (TIndexOffU*)(mmFile[0] + bytesRead);
			bytesRead += eh->_eftabLen*OFF_SIZE;
			fseeko(_in1, eh->_eftabLen*OFF_SIZE, SEEK_CUR);
#endif
		} else {
			this->_eftab = new TIndexOffU[eh->_eftabLen];
			if(switchEndian) {
				for(TIndexOffU i = 0; i < eh->_eftabLen; i++)
					this->_eftab[i] = readU<TIndexOffU>(_in1, switchEndian);
			} else {
				size_t r = MM_READ(_in1, (void *)this->_eftab, eh->_eftabLen*OFF_SIZE);
				if(r != (size_t)(eh->_eftabLen*OFF_SIZE)) {
					cerr << "Error reading _eftab[] array: " << r << ", " << (eh->_eftabLen*OFF_SIZE) << endl;
					throw 1;
				}
			}
		}
		for(TIndexOffU i = 0; i < eh->_eftabLen; i++) {
			if(i > 0 && this->_eftab[i] > 0) {
				assert_geq(this->_eftab[i], this->_eftab[i-1]);
			} else if(i > 0 && this->_eftab[i-1] == 0) {
				assert_eq(0, this->_eftab[i]);
			}
		}
	} catch(bad_alloc& e) {
		cerr << "Out of memory allocating fchr[], ftab[] or eftab[] arrays for the Bowtie index." << endl
		     << "Please try again on a computer with more memory." << endl;
		throw 1;
	}

	// Read reference sequence names from primary index file (or not,
	// if --refidx is specified)
	if(loadNames) {
		while(true) {
			char c = '\0';
			if(MM_READ(_in1, (void *)(&c), (size_t)1) != (size_t)1) break;
			bytesRead++;
			if(c == '\0') break;
			else if(c == '\n') {
				this->_refnames.push_back("");
			} else {
				if(this->_refnames.size() == 0) {
					this->_refnames.push_back("");
				}
				this->_refnames.back().push_back(c);
			}
		}
	}

	bytesRead = 4; // reset for secondary index file (already read 1-sentinel)

	shmemLeader = true;
	if(_verbose || startVerbose) {
		cerr << "Reading offs (" << offsLenSampled << " 32-bit words): ";
		logTime(cerr);
	}
	if(!_useMm) {
		if(!useShmem_) {
			// Allocate offs_
			try {
				this->_offs = new TIndexOffU[offsLenSampled];
			} catch(bad_alloc& e) {
				cerr << "Out of memory allocating the offs[] array  for the Bowtie index." << endl
					 << "Please try again on a computer with more memory." << endl;
				throw 1;
			}
		} else {
			shmemLeader = ALLOC_SHARED_U(
				(_in2Str + "[offs]"), offsLenSampled*OFF_SIZE, &this->_offs,
				"offs", (_verbose || startVerbose));
		}
	}

	if(_overrideOffRate < 32) {
		if(shmemLeader) {
			// Allocate offs (big allocation)
			if(switchEndian || offRateDiff > 0) {
				assert(!_useMm);
				const TIndexOffU blockMaxSz = (2 * 1024 * 1024); // 2 MB block size
				const TIndexOffU blockMaxSzU = (blockMaxSz >> (OFF_SIZE/4 +1)); // # U32s per block
				char *buf = new char[blockMaxSz];
				for(TIndexOffU i = 0; i < offsLen; i += blockMaxSzU) {
					TIndexOffU block = min<TIndexOffU>(blockMaxSzU, offsLen - i);
					size_t r = MM_READ(_in2, (void *)buf, block << (OFF_SIZE/4 + 1));
					if(r != (size_t)(block << (OFF_SIZE/4 + 1))) {
						cerr << "Error reading block of offs array: " << r << ", " << (block << (OFF_SIZE/4 + 1)) << endl
						     << "Your index files may be corrupt; please try re-building or re-downloading." << endl
						     << "A complete index consists of 6 files: XYZ.1.ebwt, XYZ.2.ebwt, XYZ.3.ebwt," << endl
						     << "XYZ.4.ebwt, XYZ.rev.1.ebwt, and XYZ.rev.2.ebwt.  The XYZ.1.ebwt and " << endl
						     << "XYZ.rev.1.ebwt files should have the same size, as should the XYZ.2.ebwt and" << endl
						     << "XYZ.rev.2.ebwt files." << endl;
						throw 1;
					}
					TIndexOffU idx = i >> offRateDiff;
					for(TIndexOffU j = 0; j < block; j += (1 << offRateDiff)) {
						assert_lt(idx, offsLenSampled);
						this->_offs[idx] = ((TIndexOffU*)buf)[j];
						if(switchEndian) {
							this->_offs[idx] = endianSwapU(this->_offs[idx]);
						}
						idx++;
					}
				}
				delete[] buf;
			} else {
				if(_useMm) {
#ifdef BOWTIE_MM
					this->_offs = (TIndexOffU*)(mmFile[1] + bytesRead);
					bytesRead += offsSz;
					// Argument to lseek can be 64 bits if compiled with
					// _FILE_OFFSET_BITS
					fseeko(_in2, offsSz, SEEK_CUR);
#endif
				} else {
					// If any of the high two bits are set
					// Workaround for small-index mode where MM_READ may
					// not be able to handle read amounts greater than 2^32
					// bytes.
					uint64_t bytesLeft = offsSz;
					char *offs = (char *)this->offs();

					while(bytesLeft > 0) {
						size_t r = MM_READ(_in2, (void*)offs, bytesLeft);
						if(MM_IS_IO_ERR(_in2,r,bytesLeft)) {
							cerr << "Error reading block of _offs[] array: "
							     << r << ", " << bytesLeft << gLastIOErrMsg << endl;
							throw 1;
						}
						offs += r;
						bytesLeft -= r;
					}
				}
			}

			{
				ASSERT_ONLY(Bitset offsSeen(len+1));
				for(TIndexOffU i = 0; i < offsLenSampled; i++) {
					assert(!offsSeen.test(this->_offs[i]));
					ASSERT_ONLY(offsSeen.set(this->_offs[i]));
					assert_leq(this->_offs[i], len);
				}
			}

			if(useShmem_) NOTIFY_SHARED(this->_offs, offsLenSampled*OFF_SIZE);
		} else {
			// Not the shmem leader
			fseeko(_in2, offsLenSampled*OFF_SIZE, SEEK_CUR);
			if(useShmem_) WAIT_SHARED(this->_offs, offsLenSampled*OFF_SIZE);
		}
	}

	// Allocate _isa[] (big allocation)
	if(_verbose || startVerbose) {
		cerr << "Reading isa (" << isaLenSampled << "): ";
		logTime(cerr);
	}
	if(!_useMm) {
		try {
			this->_isa = new TIndexOffU[isaLenSampled];
		} catch(bad_alloc& e) {
			cerr << "Out of memory allocating the isa[] array  for the Bowtie index." << endl
				 << "Please try again on a computer with more memory." << endl;
			throw 1;
		}
	}
	// Read _isa[]
	if(switchEndian || isaRateDiff > 0) {
		assert(!_useMm);
		for(TIndexOffU i = 0; i < isaLen; i++) {
			if((i & ~(OFF_MASK << isaRateDiff)) != 0) {
				char tmp[OFF_SIZE];
				size_t r = MM_READ(_in2, (void *)tmp, OFF_SIZE);
				if(r != (size_t)OFF_SIZE) {
					cerr << "Error reading a word of the _isa[] array: " << r << ", 4" << endl;
					throw 1;
				}
			} else {
				TIndexOffU idx = i >> isaRateDiff;
				assert_lt(idx, isaLenSampled);
				this->_isa[idx] = readU<TIndexOffU>(_in2, switchEndian);
			}
		}
	} else {
		if(_useMm) {
#ifdef BOWTIE_MM
			this->_isa = (TIndexOffU*)(mmFile[1] + bytesRead);
			bytesRead += (isaLen << 2);
			fseeko(_in2, (isaLen << 2), SEEK_CUR);
#endif
		} else {
			size_t r = MM_READ(_in2, (void *)this->_isa, isaLen*OFF_SIZE);
			if(r != (size_t)(isaLen*OFF_SIZE)) {
				cerr << "Error reading _isa[] array: " << r << ", " << (isaLen*OFF_SIZE) << endl;
				throw 1;
			}
		}
	}

	{
		ASSERT_ONLY(Bitset isasSeen(len+1));
		for(TIndexOffU i = 0; i < isaLenSampled; i++) {
			assert(!isasSeen.test(this->_isa[i]));
			ASSERT_ONLY(isasSeen.set(this->_isa[i]));
			assert_leq(this->_isa[i], len);
		}
	}

	this->postReadInit(*eh); // Initialize fields of Ebwt not read from file
	if(_verbose || startVerbose) print(cerr, *eh);

	// The fact that _ebwt and friends actually point to something
	// (other than NULL) now signals to other member functions that the
	// Ebwt is loaded into memory.

  done: // Exit hatch for both justHeader and !justHeader

	// Be kind
	if(deleteEh) delete eh;
	if (_in1 != NULL) rewind(_in1);
	if (_in2 != NULL) rewind(_in2);
}

/**
 * Read reference names from an input stream 'in' for an Ebwt primary
 * file and store them in 'refnames'.
 * TODO: revisit this function
 */
static inline void
readEbwtRefnames(FILE* fin, vector<string>& refnames) {
	// _in1 must already be open with the get cursor at the
	// beginning and no error flags set.
	assert(fin != NULL);
	assert_eq(ftello(fin), 0);

	// Read endianness hints from both streams
	bool switchEndian = false;
	uint32_t one = readU<uint32_t>(fin, switchEndian); // 1st word of primary stream
	if(one != 1) {
		assert_eq((1u<<24), one);
		switchEndian = true;
	}

	// Reads header entries one by one from primary stream
	TIndexOffU len          = readU<TIndexOffU>(fin, switchEndian);
	int32_t  lineRate     = readI<int32_t>(fin, switchEndian);
	int32_t  linesPerSide = readI<int32_t>(fin, switchEndian);
	int32_t  offRate      = readI<int32_t>(fin, switchEndian);
	int32_t  ftabChars    = readI<int32_t>(fin, switchEndian);
	// BTL: chunkRate is now deprecated
	int32_t flags = readI<int32_t>(fin, switchEndian);
	bool color = false;
	bool entireReverse = false;
	if(flags < 0) {
		color = (((-flags) & EBWT_COLOR) != 0);
		entireReverse = (((-flags) & EBWT_ENTIRE_REV) != 0);
	}

	// Create a new EbwtParams from the entries read from primary stream
	bool isBt2Index = false;
	if (gEbwt_ext == "bt2" || gEbwt_ext == "bt2") {
		isBt2Index = true;
	}
	EbwtParams eh(len, lineRate, linesPerSide, offRate, -1, ftabChars, color, entireReverse, isBt2Index);

	TIndexOffU nPat = readI<TIndexOffU>(fin, switchEndian); // nPat
	fseeko(fin, nPat*OFF_SIZE, SEEK_CUR);

	// Skip rstarts
	TIndexOffU nFrag = readU<TIndexOffU>(fin, switchEndian);
	fseeko(fin, nFrag*OFF_SIZE*3, SEEK_CUR);

	// Skip ebwt
	fseeko(fin, eh._ebwtTotLen, SEEK_CUR);

	// Skip zOff from primary stream
	readU<TIndexOffU>(fin, switchEndian);

	// Skip fchr
	fseeko(fin, 5 * OFF_SIZE, SEEK_CUR);

	// Skip ftab
	fseeko(fin, eh._ftabLen*OFF_SIZE, SEEK_CUR);

	// Skip eftab
	fseeko(fin, eh._eftabLen*OFF_SIZE, SEEK_CUR);

	// Read reference sequence names from primary index file
	while(true) {
		char c = '\0';
		int read_value = 0;
        read_value = fgetc(fin);
		if(read_value == EOF) break;
        c = read_value;
		if(c == '\0') break;
		else if(c == '\n') {
			refnames.push_back("");
		} else {
			if(refnames.size() == 0) {
				refnames.push_back("");
			}
			refnames.back().push_back(c);
		}
	}
	if(refnames.back().empty()) {
		refnames.pop_back();
	}

	// Be kind
    fseeko(fin, 0, SEEK_SET);
	assert(ferror(fin) == 0);
}

/**
 * Read reference names from the index with basename 'in' and store
 * them in 'refnames'.
 */
static inline void
readEbwtRefnames(const string& instr, vector<string>& refnames) {
    FILE* fin;
	// Initialize our primary and secondary input-stream fields
    fin = fopen((instr + ".1." + gEbwt_ext).c_str(),"rb");
	if(fin == NULL) {
		throw EbwtFileOpenException("Cannot open file " + instr);
	}
	assert_eq(ftello(fin), 0);
	readEbwtRefnames(fin, refnames);
    fclose(fin);
}

/**
 * Read just enough of the Ebwt's header to get its flags
 */
static inline int32_t readFlags(const string& instr) {
	ifstream in;
	// Initialize our primary and secondary input-stream fields
	in.open((instr + ".1." + gEbwt_ext).c_str(), ios_base::in | ios::binary);
	if(!in.is_open()) {
		throw EbwtFileOpenException("Cannot open file " + instr);
	}
	assert(in.is_open());
	assert(in.good());
	bool switchEndian = false;
	uint32_t one = readU<uint32_t>(in, switchEndian); // 1st word of primary stream
	if(one != 1) {
		assert_eq((1u<<24), one);
		assert_eq(1, endianSwapU32(one));
		switchEndian = true;
	}
	readU<TIndexOffU>(in, switchEndian);
	readI<int32_t>(in, switchEndian);
	readI<int32_t>(in, switchEndian);
	readI<int32_t>(in, switchEndian);
	readI<int32_t>(in, switchEndian);
	int32_t flags = readI<int32_t>(in, switchEndian);
	return flags;
}

/**
 * Read just enough of the Ebwt's header to determine whether it's
 * colorspace.
 */
static inline bool
readEbwtColor(const string& instr) {
	int32_t flags = readFlags(instr);
	if(flags < 0 && (((-flags) & EBWT_COLOR) != 0)) {
		return true;
	} else {
		return false;
	}
}

/**
 * Read just enough of the Ebwt's header to determine whether it's
 * entirely reversed.
 */
static inline bool
readEntireReverse(const string& instr) {
	int32_t flags = readFlags(instr);
	if(flags < 0 && (((-flags) & EBWT_ENTIRE_REV) != 0)) {
		return true;
	} else {
		return false;
	}
}

/**
 * Write an extended Burrows-Wheeler transform to a pair of output
 * streams.
 *
 * @param out1 output stream to primary file
 * @param out2 output stream to secondary file
 * @param be   write in big endian?
 */
template<typename TStr>
void Ebwt<TStr>::writeFromMemory(bool justHeader,
                                 ostream& out1,
                                 ostream& out2) const
{
	const EbwtParams& eh = this->_eh;
	assert(eh.repOk());
	uint32_t be = this->toBe();
	assert(out1.good());
	assert(out2.good());

	// When building an Ebwt, these header parameters are known
	// "up-front", i.e., they can be written to disk immediately,
	// before we join() or buildToDisk()
	writeI<int32_t>(out1, 1, be); // endian hint for priamry stream
	writeI<int32_t>(out2, 1, be); // endian hint for secondary stream
	writeU<TIndexOffU>(out1, eh._len,          be); // length of string (and bwt and suffix array)
	writeI<int32_t>(out1, eh._lineRate,     be); // 2^lineRate = size in bytes of 1 line
	writeI<int32_t>(out1, eh._linesPerSide, be); // not used
	writeI<int32_t>(out1, eh._offRate,      be); // every 2^offRate chars is "marked"
	writeI<int32_t>(out1, eh._ftabChars,    be); // number of 2-bit chars used to address ftab
	int32_t flags = 1;
	if(eh._color) flags |= EBWT_COLOR;
	if(eh._entireReverse) flags |= EBWT_ENTIRE_REV;
	writeI<int32_t>(out1, -flags, be); // BTL: chunkRate is now deprecated

	if(!justHeader) {
		assert(isInMemory());
		// These Ebwt parameters are known after the inputs strings have
		// been joined() but before they have been built().  These can
		// written to the disk next and then discarded from memory.
		writeU<TIndexOffU>(out1, this->_nPat,      be);
		for(TIndexOffU i = 0; i < this->_nPat; i++)
			writeU<TIndexOffU>(out1, this->_plen[i], be);
		assert_geq(this->_nFrag, this->_nPat);
		writeU<TIndexOffU>(out1, this->_nFrag, be);
		for(TIndexOffU i = 0; i < this->_nFrag*3; i++)
			writeU<TIndexOffU>(out1, this->_rstarts[i], be);

		// These Ebwt parameters are discovered only as the Ebwt is being
		// built (in buildToDisk()).  Of these, only 'offs' and 'ebwt' are
		// terribly large.  'ebwt' is written to the primary file and then
		// discarded from memory as it is built; 'offs' is similarly
		// written to the secondary file and discarded.
		out1.write((const char *)this->ebwt(), eh._ebwtTotLen);
		writeU<TIndexOffU>(out1, this->zOff(), be);
		TIndexOffU offsLen = eh._offsLen;
		for(TIndexOffU i = 0; i < offsLen; i++)
			writeU<TIndexOffU>(out2, this->_offs[i], be);
		uint32_t isaLen = eh._isaLen;
		for(TIndexOffU i = 0; i < isaLen; i++)
			writeU<TIndexOffU>(out2, this->_isa[i], be);

		// 'fchr', 'ftab' and 'eftab' are not fully determined until the
		// loop is finished, so they are written to the primary file after
		// all of 'ebwt' has already been written and only then discarded
		// from memory.
		for(int i = 0; i < 5; i++)
			writeU<TIndexOffU>(out1, this->_fchr[i], be);
		for(TIndexOffU i = 0; i < eh._ftabLen; i++)
			writeU<TIndexOffU>(out1, this->ftab()[i], be);
		for(TIndexOffU i = 0; i < eh._eftabLen; i++)
			writeU<TIndexOffU>(out1, this->eftab()[i], be);
	}
}

/**
 * Given a pair of strings representing output filenames, and assuming
 * this Ebwt object is currently in memory, write out this Ebwt to the
 * specified files.
 *
 * If sanity-checking is enabled, then once the streams have been
 * fully written and closed, we reopen them and read them into a
 * (hopefully) exact copy of this Ebwt.  We then assert that the
 * current Ebwt and the copy match in all of their fields.
 */
template<typename TStr>
void Ebwt<TStr>::writeFromMemory(bool justHeader,
                                 const string& out1,
                                 const string& out2) const
{
	const EbwtParams& eh = this->_eh;
	assert(isInMemory());
	assert(eh.repOk());

	ofstream fout1(out1.c_str(), ios::binary);
	ofstream fout2(out2.c_str(), ios::binary);
	writeFromMemory(justHeader, fout1, fout2);
	fout1.close();
	fout2.close();

	// Read the file back in and assert that all components match
	if(_sanity) {
		if(_verbose)
			cout << "Re-reading \"" << out1 << "\"/\"" << out2 << "\" for sanity check" << endl;
		Ebwt copy(out1, out2, _verbose, _sanity);
		assert(!isInMemory());
		copy.loadIntoMemory(eh._color ? 1 : 0, -1, false, false);
		assert(isInMemory());
	    assert_eq(eh._lineRate,     copy.eh()._lineRate);
	    assert_eq(eh._linesPerSide, copy.eh()._linesPerSide);
	    assert_eq(eh._offRate,      copy.eh()._offRate);
	    assert_eq(eh._isaRate,      copy.eh()._isaRate);
	    assert_eq(eh._ftabChars,    copy.eh()._ftabChars);
	    assert_eq(eh._len,          copy.eh()._len);
	    assert_eq(_zOff,             copy.zOff());
	    assert_eq(_zEbwtBpOff,       copy.zEbwtBpOff());
	    assert_eq(_zEbwtByteOff,     copy.zEbwtByteOff());
		assert_eq(_nPat,             copy.nPat());
		for(TIndexOffU i = 0; i < _nPat; i++)
			assert_eq(this->_plen[i], copy.plen()[i]);
		assert_eq(this->_nFrag, copy.nFrag());
		for(TIndexOffU i = 0; i < this->nFrag*3; i++) {
			assert_eq(this->_rstarts[i], copy.rstarts()[i]);
		}
		for(uint32_t i = 0; i < 5; i++)
			assert_eq(this->_fchr[i], copy.fchr()[i]);
		for(TIndexOffU i = 0; i < eh._ftabLen; i++)
			assert_eq(this->ftab()[i], copy.ftab()[i]);
		for(TIndexOffU i = 0; i < eh._eftabLen; i++)
			assert_eq(this->eftab()[i], copy.eftab()[i]);
		for(TIndexOffU i = 0; i < eh._offsLen; i++)
			assert_eq(this->_offs[i], copy.offs()[i]);
		for(TIndexOffU i = 0; i < eh._isaLen; i++)
			assert_eq(this->_isa[i], copy.isa()[i]);
		for(TIndexOffU i = 0; i < eh._ebwtTotLen; i++)
			assert_eq(this->ebwt()[i], copy.ebwt()[i]);
		//copy.sanityCheckAll();
		if(_verbose)
			cout << "Read-in check passed for \"" << out1 << "\"/\"" << out2 << "\"" << endl;
	}
}

///////////////////////////////////////////////////////////////////////
//
// Functions for building Ebwts
//
///////////////////////////////////////////////////////////////////////

/**
 * Join several text strings together in a way that's compatible with
 * the text-chunking scheme dictated by chunkRate parameter.
 *
 * The non-static member Ebwt::join additionally builds auxilliary
 * arrays that maintain a mapping between chunks in the joined string
 * and the original text strings.
 */
template<typename TStr>
TStr Ebwt<TStr>::join(vector<TStr>& l, uint32_t seed) {
	RandomSource rand; // reproducible given same seed
	rand.init(seed);
	TStr ret;
	size_t guessLen = 0;
	for(size_t i = 0; i < l.size(); i++) {
		guessLen += length(l[i]);
	}
	reserve(ret, guessLen, Exact());
	for(size_t i = 0; i < l.size(); i++) {
		TStr& s = l[i];
		assert_gt(length(s), 0);
		append(ret, s);
	}
	return ret;
}

/**
 * Join several text strings together in a way that's compatible with
 * the text-chunking scheme dictated by chunkRate parameter.
 *
 * The non-static member Ebwt::join additionally builds auxilliary
 * arrays that maintain a mapping between chunks in the joined string
 * and the original text strings.
 */
template<typename TStr>
TStr Ebwt<TStr>::join(vector<FileBuf*>& l,
                      vector<RefRecord>& szs,
                      TIndexOffU sztot,
                      const RefReadInParams& refparams,
                      uint32_t seed)
{
	RandomSource rand; // reproducible given same seed
	rand.init(seed);
	RefReadInParams rpcp = refparams;
	TStr ret;
	size_t guessLen = sztot;
	reserve(ret, guessLen, Exact());
	ASSERT_ONLY(size_t szsi = 0);
	for(size_t i = 0; i < l.size(); i++) {
		// For each sequence we can pull out of istream l[i]...
		assert(!l[i]->eof());
		bool first = true;
		while(!l[i]->eof()) {
			RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp);
#ifndef ACCOUNT_FOR_ALL_GAP_REFS
			if(rec.first && rec.len == 0) rec.first = false;
#endif
			first = false;
			size_t bases = rec.len;
			assert_eq(rec.off, szs[szsi].off);
			assert_eq(rec.len, szs[szsi].len);
			assert_eq(rec.first, szs[szsi].first);
			ASSERT_ONLY(szsi++);
			if(bases == 0) continue;
		}
	}
	return ret;
}

/**
 * Join several text strings together according to the text-chunking
 * scheme specified in the EbwtParams.  Ebwt fields calculated in this
 * function are written directly to disk.
 *
 * It is assumed, but not required, that the header values have already
 * been written to 'out1' before this function is called.
 *
 * The static member Ebwt::join just returns a joined version of a
 * list of strings without building any of the auxiliary arrays.
 * Because the pseudo-random number generator is the same, we expect
 * this function and the static function to give the same result given
 * the same seed.
 */
template<typename TStr>
void Ebwt<TStr>::joinToDisk(
	vector<FileBuf*>& l,
	vector<RefRecord>& szs,
	vector<uint32_t>& plens,
	TIndexOffU sztot,
	const RefReadInParams& refparams,
	TStr& ret,
	ostream& out1,
	ostream& out2,
	uint32_t seed)
{
	RandomSource rand; // reproducible given same seed
	rand.init(seed);
	RefReadInParams rpcp = refparams;
	assert_gt(szs.size(), 0);
	assert_gt(l.size(), 0);
	assert_gt(sztot, 0);
	// Not every fragment represents a distinct sequence - many
	// fragments may correspond to a single sequence.  Count the
	// number of sequences here by counting the number of "first"
	// fragments.
	this->_nPat = 0;
	this->_nFrag = 0;
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
	int nGapFrag = 0;
#endif
	for(size_t i = 0; i < szs.size(); i++) {
		if(szs[i].len > 0) this->_nFrag++;
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
		if(szs[i].len == 0 && szs[i].off > 0) nGapFrag++;
		if(szs[i].first && szs[i].len > 0) this->_nPat++;
#else
		// For all records where len=0 and first=1, set first=0
		assert(szs[i].len > 0 || !szs[i].first);
		if(szs[i].first) this->_nPat++;
#endif
	}
	assert_gt(this->_nPat, 0);
	assert_geq(this->_nFrag, this->_nPat);
	this->_rstarts = NULL;
	writeU<TIndexOffU>(out1, this->_nPat, this->toBe());
	assert_eq(plens.size(), this->_nPat);
	// Allocate plen[]
	try {
		this->_plen = new TIndexOffU[this->_nPat];
	} catch(bad_alloc& e) {
		cerr << "Out of memory allocating plen[] in Ebwt::join()"
		     << " at " << __FILE__ << ":" << __LINE__ << endl;
		throw e;
	}
	// For each pattern, set plen
	for(TIndexOffU i = 0; i < plens.size(); i++) {
		this->_plen[i] = plens[i];
		writeU<TIndexOffU>(out1, this->_plen[i], this->toBe());
	}
	// Write the number of fragments
	writeU<TIndexOffU>(out1, this->_nFrag, this->toBe());
	TIndexOffU seqsRead = 0;
	ASSERT_ONLY(TIndexOffU szsi = 0);
	ASSERT_ONLY(TIndexOffU entsWritten = 0);
	// For each filebuf
	for(unsigned int i = 0; i < l.size(); i++) {
		assert(!l[i]->eof());
		bool first = true;
		TIndexOffU patoff = 0;
		// For each *fragment* (not necessary an entire sequence) we
		// can pull out of istream l[i]...
		while(!l[i]->eof()) {
			string name;
			// Push a new name onto our vector
			_refnames.push_back("");
			//uint32_t oldRetLen = length(ret);
			RefRecord rec = fastaRefReadAppend(*l[i], first, ret, rpcp, &_refnames.back());
#ifndef ACCOUNT_FOR_ALL_GAP_REFS
			if(rec.first && rec.len == 0) rec.first = false;
#endif
			first = false;
			if(rec.first) {
				if(_refnames.back().length() == 0) {
					// If name was empty, replace with an index
					ostringstream stm;
					stm << (_refnames.size()-1);
					_refnames.back() = stm.str();
				}
			} else {
				// This record didn't actually start a new sequence so
				// no need to add a name
				//assert_eq(0, _refnames.back().length());
				_refnames.pop_back();
			}
			assert_lt(szsi, szs.size());
			assert(szs[szsi].first == 0 || szs[szsi].first == 1);
			assert_eq(rec.off, szs[szsi].off);
			assert_eq(rec.len, szs[szsi].len);
			// szs[szsi].first == 2 sometimes?!?!  g++ is unable to do
			// the following correctly, regardless of how I write it
			//assert((rec.first == 0) == (szs[szsi].first == 0));
			assert(rec.first || rec.off > 0);
			ASSERT_ONLY(szsi++);
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
			if(rec.len == 0) continue;
			if(rec.first && rec.len > 0) seqsRead++;
			assert_leq(rec.len, this->_plen[seqsRead-1]);
#else
			if(rec.first) seqsRead++;
			if(rec.len == 0) continue;
			assert_leq(rec.len, this->_plen[seqsRead-1]);
#endif
			// Reset the patoff if this is the first fragment
			if(rec.first) patoff = 0;
			patoff += rec.off; // add fragment's offset from end of last frag.
			// Adjust rpcps
			//uint32_t seq = seqsRead-1;
			ASSERT_ONLY(entsWritten++);
			// This is where rstarts elements are written to the output stream
			//writeU32(out1, oldRetLen, this->toBe()); // offset from beginning of joined string
			//writeU32(out1, seq,       this->toBe()); // sequence id
			//writeU32(out1, patoff,    this->toBe()); // offset into sequence
			patoff += rec.len;
		}
		assert_gt(szsi, 0);
		l[i]->reset();
		assert(!l[i]->eof());
		#ifndef NDEBUG
		int c = l[i]->get();
		assert_eq('>', c);
		assert(!l[i]->eof());
		l[i]->reset();
		assert(!l[i]->eof());
		#endif
	}
	assert_eq(entsWritten, this->_nFrag);
}


/**
 * Build an Ebwt from a string 's' and its suffix array 'sa' (which
 * might actually be a suffix array *builder* that builds blocks of the
 * array on demand).  The bulk of the Ebwt, i.e. the ebwt and offs
 * arrays, is written directly to disk.  This is by design: keeping
 * those arrays in memory needlessly increases the footprint of the
 * building process.  Instead, we prefer to build the Ebwt directly
 * "to disk" and then read it back into memory later as necessary.
 *
 * It is assumed that the header values and join-related values (nPat,
 * plen) have already been written to 'out1' before this function
 * is called.  When this function is finished, it will have
 * additionally written ebwt, zOff, fchr, ftab and eftab to the primary
 * file and offs to the secondary file.
 *
 * Assume DNA/RNA/any alphabet with 4 or fewer elements.
 * Assume occ array entries are 32 bits each.
 *
 * @param sa            the suffix array to convert to a Ebwt
 * @param buildISA      whether to output an ISA sample into out2 after
 *                      the SA sample
 * @param s             the original string
 * @param out
 */
template<typename TStr>
void Ebwt<TStr>::buildToDisk(InorderBlockwiseSA<TStr>& sa,
                             const TStr& s,
                             ostream& out1,
                             ostream& out2)
{
	const EbwtParams& eh = this->_eh;

	assert(eh.repOk());
	assert_eq(length(s)+1, sa.size());
	assert_eq(length(s), eh._len);
	assert_gt(eh._lineRate, 3);
	assert(sa.suffixItrIsReset());
	assert_leq((int)ValueSize<Dna>::VALUE, 4);

	TIndexOffU  len = eh._len;
	TIndexOffU  ftabLen = eh._ftabLen;
	TIndexOffU  sideSz = eh._sideSz;
	TIndexOffU  ebwtTotSz = eh._ebwtTotSz;
	TIndexOffU  fchr[] = {0, 0, 0, 0, 0};
	TIndexOffU* ftab = NULL;
	TIndexOffU  zOff = OFF_MASK;

	// Save # of occurrences of each character as we walk along the bwt
	TIndexOffU occ[4] = {0, 0, 0, 0};
	// Save 'G' and 'T' occurrences between backward and forward buckets
	TIndexOffU occSave[2] = {0, 0};

	// Record rows that should "absorb" adjacent rows in the ftab.
	// The absorbed rows represent suffixes shorter than the ftabChars
	// cutoff.
	uint8_t absorbCnt = 0;
	uint8_t *absorbFtab;
	try {
		VMSG_NL("Allocating ftab, absorbFtab");
		ftab = new TIndexOffU[ftabLen];
		memset(ftab, 0, OFF_SIZE * ftabLen);
		absorbFtab = new uint8_t[ftabLen];
		memset(absorbFtab, 0, ftabLen);
	} catch(bad_alloc &e) {
		cerr << "Out of memory allocating ftab[] or absorbFtab[] "
		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
		     << __LINE__ << endl;
		throw e;
	}
	assert(ftab != NULL);
	assert(absorbFtab != NULL);

	// Allocate the side buffer; holds a single side as its being
	// constructed and then written to disk.  Reused across all sides.
#ifdef SIXTY4_FORMAT
	uint64_t *ebwtSide = NULL;
#else
	uint8_t *ebwtSide = NULL;
#endif
	try {
#ifdef SIXTY4_FORMAT
		ebwtSide = new uint64_t[sideSz >> 3];
#else
		ebwtSide = new uint8_t[sideSz];
#endif
	} catch(bad_alloc &e) {
		cerr << "Out of memory allocating ebwtSide[] in "
		     << "Ebwt::buildToDisk() at " << __FILE__ << ":"
		     << __LINE__ << endl;
		throw e;
	}
	assert(ebwtSide != NULL);

	// Allocate a buffer to hold the ISA sample, which we accumulate in
	// the loop and then output at the end.  We can't write output the
	// ISA right away because the order in which we calculate its
	// elements is based on the suffix array, which we only see bit by
	// bit
	uint32_t *isaSample = NULL;
	if(eh._isaRate >= 0) {
		try {
			isaSample = new uint32_t[eh._isaLen];
		} catch(bad_alloc &e) {
			cerr << "Out of memory allocating isaSample[] in "
			     << "Ebwt::buildToDisk() at " << __FILE__ << ":"
			     << __LINE__ << endl;
			throw e;
		}
		assert(isaSample != NULL);
	}

	// Points to the base offset within ebwt for the side currently
	// being written
	TIndexOffU side = 0;
	// Points to a byte offset from 'side' within ebwt[] where next
	// char should be written
#ifdef SIXTY4_FORMAT
	TIndexOff sideCur = (eh._sideBwtSz >> 3) - 1;
#else
	TIndexOff sideCur = eh._sideBwtSz - 1;
#endif

	// Whether we're assembling a forward or a reverse bucket
	bool fw = false;

	// Did we just finish writing a forward bucket?  (Must be true when
	// we exit the loop.)
	ASSERT_ONLY(bool wroteFwBucket = false);

	// Have we skipped the '$' in the last column yet?
	ASSERT_ONLY(bool dollarSkipped = false);

	TIndexOffU si = 0;   // string offset (chars)
	ASSERT_ONLY(TIndexOffU lastSufInt = 0);
	ASSERT_ONLY(bool inSA = true); // true iff saI still points inside suffix
	                               // array (as opposed to the padding at the
	                               // end)
	// Iterate over packed bwt bytes
	VMSG_NL("Entering Ebwt loop");
	ASSERT_ONLY(TIndexOffU beforeEbwtOff = (uint32_t)out1.tellp());
	while(side < ebwtTotSz) {
		ASSERT_ONLY(wroteFwBucket = false);
		// Sanity-check our cursor into the side buffer
		assert_geq(sideCur, 0);
		assert_lt(sideCur, (int)eh._sideBwtSz);
		assert_eq(0, side % sideSz); // 'side' must be on side boundary
		ebwtSide[sideCur] = 0; // clear
		assert_lt(side + sideCur, ebwtTotSz);
		// Iterate over bit-pairs in the si'th character of the BWT
#ifdef SIXTY4_FORMAT
		for(int bpi = 0; bpi < 32; bpi++, si++)
#else
		for(int bpi = 0; bpi < 4; bpi++, si++)
#endif
		{
			int bwtChar;
			bool count = true;
			if(si <= len) {
				// Still in the SA; extract the bwtChar
				TIndexOffU saElt = sa.nextSuffix();
				// (that might have triggered sa to calc next suf block)
				if(isaSample != NULL && (saElt & eh._isaMask) == saElt) {
					// This element belongs in the ISA sample.  Add
					// an entry mapping the text offset to the offset
					// into the suffix array that holds the suffix
					// beginning with the character at that text offset
					assert_lt((saElt >> eh._isaRate), eh._isaLen);
					isaSample[saElt >> eh._isaRate] = si;
				}
				if(saElt == 0) {
					// Don't add the '$' in the last column to the BWT
					// transform; we can't encode a $ (only A C T or G)
					// and counting it as, say, an A, will mess up the
					// LR mapping
					bwtChar = 0; count = false;
					ASSERT_ONLY(dollarSkipped = true);
					zOff = si; // remember the SA row that
					           // corresponds to the 0th suffix
				} else {
					bwtChar = (int)(Dna)(s[saElt-1]);
					assert_lt(bwtChar, 4);
					// Update the fchr
					fchr[bwtChar]++;
				}
				// Update ftab
				if((len-saElt) >= (TIndexOffU)eh._ftabChars) {
					// Turn the first ftabChars characters of the
					// suffix into an integer index into ftab
					TIndexOffU sufInt = 0;
					for(int i = 0; i < eh._ftabChars; i++) {
						sufInt <<= 2;
						assert_lt((TIndexOffU)i, len-saElt);
						sufInt |= (unsigned char)(Dna)(s[saElt+i]);
					}
					// Assert that this prefix-of-suffix is greater
					// than or equal to the last one (true b/c the
					// suffix array is sorted)
					#ifndef NDEBUG
					if(lastSufInt > 0) assert_geq(sufInt, lastSufInt);
					lastSufInt = sufInt;
					#endif
					// Update ftab
					assert_lt(sufInt+1, ftabLen);
					ftab[sufInt+1]++;
					if(absorbCnt > 0) {
						// Absorb all short suffixes since the last
						// transition into this transition
						absorbFtab[sufInt] = absorbCnt;
						absorbCnt = 0;
					}
				} else {
					// Otherwise if suffix is fewer than ftabChars
					// characters long, then add it to the 'absorbCnt';
					// it will be absorbed into the next transition
					assert_lt(absorbCnt, 255);
					absorbCnt++;
				}
				// Suffix array offset boundary? - update offset array
				if((si & eh._offMask) == si) {
					assert_lt((si >> eh._offRate), eh._offsLen);
					// Write offsets directly to the secondary output
					// stream, thereby avoiding keeping them in memory
					writeU<TIndexOffU>(out2, saElt, this->toBe());
				}
			} else {
				// Strayed off the end of the SA, now we're just
				// padding out a bucket
				#ifndef NDEBUG
				if(inSA) {
					// Assert that we wrote all the characters in the
					// string before now
					assert_eq(si, len+1);
					inSA = false;
				}
				#endif
				// 'A' used for padding; important that padding be
				// counted in the occ[] array
				bwtChar = 0;
			}
			if(count) occ[bwtChar]++;
			// Append BWT char to bwt section of current side
			if(fw) {
				// Forward bucket: fill from least to most
#ifdef SIXTY4_FORMAT
				ebwtSide[sideCur] |= ((uint64_t)bwtChar << (bpi << 1));
				if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
#else
				pack_2b_in_8b(bwtChar, ebwtSide[sideCur], bpi);
				assert_eq((ebwtSide[sideCur] >> (bpi*2)) & 3, bwtChar);
#endif
			} else {
				// Backward bucket: fill from most to least
#ifdef SIXTY4_FORMAT
				ebwtSide[sideCur] |= ((uint64_t)bwtChar << ((31 - bpi) << 1));
				if(bwtChar > 0) assert_gt(ebwtSide[sideCur], 0);
#else
				pack_2b_in_8b(bwtChar, ebwtSide[sideCur], 3-bpi);
				assert_eq((ebwtSide[sideCur] >> ((3-bpi)*2)) & 3, bwtChar);
#endif
			}
		} // end loop over bit-pairs
		assert_eq(dollarSkipped ? 3 : 0, (occ[0] + occ[1] + occ[2] + occ[3]) & 3);
#ifdef SIXTY4_FORMAT
		assert_eq(0, si & 31);
#else
		assert_eq(0, si & 3);
#endif
		if(fw) sideCur++;
		else   sideCur--;
#ifdef SIXTY4_FORMAT
		if(sideCur == (int)eh._sideBwtSz >> 3)
#else
		if(sideCur == (int)eh._sideBwtSz)
#endif
		{
			// Forward side boundary
			assert_eq(0, si % eh._sideBwtLen);
#ifdef SIXTY4_FORMAT
			sideCur = (eh._sideBwtSz >> 3) - 1;
#else
			sideCur = eh._sideBwtSz - 1;
#endif
			assert(fw); fw = false;
			ASSERT_ONLY(wroteFwBucket = true);
			// Write 'G' and 'T'
			assert_leq(occSave[0], occ[2]);
			assert_leq(occSave[1], occ[3]);
			TIndexOffU *u32side = reinterpret_cast<TIndexOffU*>(ebwtSide);
			side += sideSz;
			assert_leq(side, eh._ebwtTotSz);
#ifdef BOWTIE_64BIT_INDEX
			u32side[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occSave[0], this->toBe());
			u32side[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occSave[1], this->toBe());
#else
			u32side[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occSave[0], this->toBe());
			u32side[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occSave[1], this->toBe());
#endif
			// Write forward side to primary file
			out1.write((const char *)ebwtSide, sideSz);
		} else if (sideCur == -1) {
			// Backward side boundary
			assert_eq(0, si % eh._sideBwtLen);
			sideCur = 0;
			assert(!fw); fw = true;
			// Write 'A' and 'C'
			TIndexOffU *u32side = reinterpret_cast<TIndexOffU*>(ebwtSide);
			side += sideSz;
			assert_leq(side, eh._ebwtTotSz);
#ifdef BOWTIE_64BIT_INDEX
			u32side[(sideSz >> 3)-2] = endianizeU<TIndexOffU>(occ[0], this->toBe());
			u32side[(sideSz >> 3)-1] = endianizeU<TIndexOffU>(occ[1], this->toBe());
#else
			u32side[(sideSz >> 2)-2] = endianizeU<TIndexOffU>(occ[0], this->toBe());
			u32side[(sideSz >> 2)-1] = endianizeU<TIndexOffU>(occ[1], this->toBe());
#endif
			occSave[0] = occ[2]; // save 'G' count
			occSave[1] = occ[3]; // save 'T' count
			// Write backward side to primary file
			out1.write((const char *)ebwtSide, sideSz);
		}
	}
	VMSG_NL("Exited Ebwt loop");
	assert(ftab != NULL);
	assert_neq(zOff, OFF_MASK);
	if(absorbCnt > 0) {
		// Absorb any trailing, as-yet-unabsorbed short suffixes into
		// the last element of ftab
		absorbFtab[ftabLen-1] = absorbCnt;
	}
	// Assert that our loop counter got incremented right to the end
	assert_eq(side, eh._ebwtTotSz);
	// Assert that we wrote the expected amount to out1
	assert_eq(((TIndexOffU)out1.tellp() - beforeEbwtOff), eh._ebwtTotSz);
	// assert that the last thing we did was write a forward bucket
	assert(wroteFwBucket);

	//
	// Write zOff to primary stream
	//
	writeU<TIndexOffU>(out1, zOff, this->toBe());

	//
	// Finish building fchr
	//
	// Exclusive prefix sum on fchr
	for(int i = 1; i < 4; i++) {
		fchr[i] += fchr[i-1];
	}
	assert_eq(fchr[3], len);
	// Shift everybody up by one
	for(int i = 4; i >= 1; i--) {
		fchr[i] = fchr[i-1];
	}
	fchr[0] = 0;
	if(_verbose) {
		for(int i = 0; i < 5; i++)
			cout << "fchr[" << "ACGT$"[i] << "]: " << fchr[i] << endl;
	}
	// Write fchr to primary file
	for(int i = 0; i < 5; i++) {
		writeU<TIndexOffU>(out1, fchr[i], this->toBe());
	}

	//
	// Finish building ftab and build eftab
	//
	// Prefix sum on ftable
	TIndexOffU eftabLen = 0;
	assert_eq(0, absorbFtab[0]);
	for(TIndexOffU i = 1; i < ftabLen; i++) {
		if(absorbFtab[i] > 0) eftabLen += 2;
	}
	assert_leq(eftabLen, (TIndexOffU)eh._ftabChars*2);
	eftabLen = eh._ftabChars*2;
	TIndexOffU *eftab = NULL;
	try {
		eftab = new TIndexOffU[eftabLen];
		memset(eftab, 0, OFF_SIZE * eftabLen);
	} catch(bad_alloc &e) {
		cerr << "Out of memory allocating eftab[] "
		     << "in Ebwt::buildToDisk() at " << __FILE__ << ":"
		     << __LINE__ << endl;
		throw e;
	}
	assert(eftab != NULL);
	TIndexOffU eftabCur = 0;
	for(TIndexOffU i = 1; i < ftabLen; i++) {
		TIndexOffU lo = ftab[i] + Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i-1);
		if(absorbFtab[i] > 0) {
			// Skip a number of short pattern indicated by absorbFtab[i]
			TIndexOffU hi = lo + absorbFtab[i];
			assert_lt(eftabCur*2+1, eftabLen);
			eftab[eftabCur*2] = lo;
			eftab[eftabCur*2+1] = hi;
			ftab[i] = (eftabCur++) ^ OFF_MASK; // insert pointer into eftab
			assert_eq(lo, Ebwt::ftabLo(ftab, eftab, len, ftabLen, eftabLen, i));
			assert_eq(hi, Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, i));
		} else {
			ftab[i] = lo;
		}
	}
	assert_eq(Ebwt::ftabHi(ftab, eftab, len, ftabLen, eftabLen, ftabLen-1), len+1);
	// Write ftab to primary file
	for(TIndexOffU i = 0; i < ftabLen; i++) {
		writeU<TIndexOffU>(out1, ftab[i], this->toBe());
	}
	// Write eftab to primary file
	for(TIndexOffU i = 0; i < eftabLen; i++) {
		writeU<TIndexOffU>(out1, eftab[i], this->toBe());
	}
	// Write isa to primary file
	if(isaSample != NULL) {
		ASSERT_ONLY(Bitset sawISA(eh._len+1));
		for(TIndexOffU i = 0; i < eh._isaLen; i++) {
			TIndexOffU s = isaSample[i];
			assert_leq(s, eh._len);
			assert(!sawISA.test(s));
			ASSERT_ONLY(sawISA.set(s));
			writeU<TIndexOffU>(out2, s, this->toBe());
		}
		delete[] isaSample;
	}
	delete[] ftab;
	delete[] eftab;
	delete[] absorbFtab;

	// Note: if you'd like to sanity-check the Ebwt, you'll have to
	// read it back into memory first!
	assert(!isInMemory());
	VMSG_NL("Exiting Ebwt::buildToDisk()");
}

/**
 * Try to find the Bowtie index specified by the user.  First try the
 * exact path given by the user.  Then try the user-provided string
 * appended onto the path of the "indexes" subdirectory below this
 * executable, then try the provided string appended onto
 * "$BOWTIE_INDEXES/".
 */
string adjustEbwtBase(const string& cmdline,
					  const string& ebwtFileBase,
					  bool verbose = false);

#endif /*EBWT_H_*/
